def __init__(self, verbose=True):
    if verbose:
        logger.debug('Build Multilayer Perceptron Ranking model...')
    # Positive input setting
    self.inputPL = T.matrix(name='inputPL', dtype=floatX)
    self.inputPR = T.matrix(name='inputPR', dtype=floatX)
    # Negative input setting
    self.inputNL = T.matrix(name='inputNL', dtype=floatX)
    self.inputNR = T.matrix(name='inputNR', dtype=floatX)
    # Standard input setting
    self.inputL = T.matrix(name='inputL', dtype=floatX)
    self.inputR = T.matrix(name='inputR', dtype=floatX)
    # Build activation function
    self.act = Activation('tanh')
    # Connect input matrices
    self.inputP = T.concatenate([self.inputPL, self.inputPR], axis=1)
    self.inputN = T.concatenate([self.inputNL, self.inputNR], axis=1)
    self.input = T.concatenate([self.inputL, self.inputR], axis=1)
    # Build hidden layer
    self.hidden_layer = HiddenLayer(self.input, (2 * edim, args.hidden), act=self.act)
    self.hidden = self.hidden_layer.output
    self.hiddenP = self.hidden_layer.encode(self.inputP)
    self.hiddenN = self.hidden_layer.encode(self.inputN)
    # Dropout parameter
    # srng = T.shared_randomstreams.RandomStreams(args.seed)
    # mask = srng.binomial(n=1, p=1-args.dropout, size=self.hidden.shape)
    # maskP = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenP.shape)
    # maskN = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenN.shape)
    # self.hidden *= T.cast(mask, floatX)
    # self.hiddenP *= T.cast(maskP, floatX)
    # self.hiddenN *= T.cast(maskN, floatX)
    # Build linear output layer
    self.score_layer = ScoreLayer(self.hidden, args.hidden)
    self.output = self.score_layer.output
    self.scoreP = self.score_layer.encode(self.hiddenP)
    self.scoreN = self.score_layer.encode(self.hiddenN)
    # Stack all the parameters
    self.params = []
    self.params += self.hidden_layer.params
    self.params += self.score_layer.params
    # Build cost function: margin ranking (hinge) loss
    self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP),
                                 1.0 - self.scoreP + self.scoreN))
    # Construct the gradient of the cost function with respect to the model parameters
    self.gradparams = T.grad(self.cost, self.params)
    # Count the total number of parameters in this model. The hidden layer takes
    # the concatenated (2 * edim)-dimensional input, so its weight matrix holds
    # 2 * edim * args.hidden entries.
    self.num_params = 2 * edim * args.hidden + args.hidden + args.hidden + 1
    # Build class methods
    self.score = theano.function(inputs=[self.inputL, self.inputR],
                                 outputs=self.output)
    self.compute_cost_and_gradient = theano.function(
        inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
        outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN])
    self.show_scores = theano.function(
        inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
        outputs=[self.scoreP, self.scoreN])
    if verbose:
        logger.debug('Finished building the architecture of the MLP Ranker; summary below:')
        logger.debug('Input dimension: %d' % edim)
        logger.debug('Hidden dimension: %d' % args.hidden)
        logger.debug('Total number of parameters used in the model: %d' % self.num_params)
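
# --- Hedged illustration (not part of the original model code) ----------------
# A tiny, self-contained NumPy sketch of the margin ranking objective built
# above, cost = mean(max(0, 1 - score_pos + score_neg)). All array names and
# sizes here are made up for illustration only.
import numpy as np

_rng = np.random.RandomState(0)
score_pos = _rng.randn(8)   # scores of 8 positive pairs
score_neg = _rng.randn(8)   # scores of 8 negative pairs

hinge = np.maximum(0.0, 1.0 - score_pos + score_neg)
print 'margin ranking cost:', hinge.mean()
# -------------------------------------------------------------------------------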
def __init__(self, random_generator, theano_random_generator=None,
             x_dim=28 * 28, y_dim=10, hidden_layer_sizes=[500, 500],
             corruption_levels=[0.1, 0.1]):
    """Stack sigmoid layers with weight-tied denoising autoencoders and top
    them with a logistic regression output layer."""
    # Declare empty sigmoid layer array for the MLP
    self.sigmoid_layers = []
    # Declare an empty array of DenoisingAutoEncoder
    self.autoencoder_layers = []
    self.params = []
    self.n_layers = len(hidden_layer_sizes)

    if theano_random_generator is None:
        self.theano_random_generator = RandomStreams(
            random_generator.randint(2 ** 30))
    else:
        self.theano_random_generator = theano_random_generator

    # Inputs using Theano
    self.x = T.matrix("x")
    self.y = T.ivector("y")

    # Initialize all parameters
    for i in range(self.n_layers):
        # Define x and y dimensions of this layer
        if i == 0:
            internal_x_dim = x_dim
        else:
            internal_x_dim = hidden_layer_sizes[i - 1]
        internal_y_dim = hidden_layer_sizes[i]

        # Find the input of this layer
        if i == 0:
            internal_input = self.x
        else:
            internal_input = self.sigmoid_layers[i - 1].output

        # Define sigmoid layer
        self.sigmoid_layers.append(
            HiddenLayer(internal_input, internal_x_dim, internal_y_dim,
                        random_generator, activation=T.nnet.sigmoid))

        # Define a denoising autoencoder that shares weights with the sigmoid layer
        self.autoencoder_layers.append(
            DenoisingAutoEncoder(random_generator, theano_random_generator,
                                 internal_x_dim, internal_y_dim,
                                 internal_input,
                                 W=self.sigmoid_layers[i].W,
                                 b=self.sigmoid_layers[i].b))

        # Update parameters
        self.params.extend(self.sigmoid_layers[i].params)

    # Finally add the logistic regression layer
    self.logistic_layer = LogisticRegression(
        self.sigmoid_layers[-1].output, hidden_layer_sizes[-1], y_dim)
    self.params.extend(self.logistic_layer.params)

    # These are the two important costs:
    # finetuning cost, used after pretraining the individual autoencoders
    self.finetune_cost = self.logistic_layer.negative_log_likelihood(self.y)
    # error from prediction
    self.error = self.logistic_layer.error(self.y)
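
# --- Hedged usage sketch (not from the original file) -------------------------
# Assuming the __init__ above belongs to a class importable here under the
# hypothetical name StackedDenoisingAutoencoder, and that HiddenLayer,
# DenoisingAutoEncoder and LogisticRegression come from the surrounding project,
# supervised finetuning could be wired up with plain SGD roughly like this.
import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(1234)
model = StackedDenoisingAutoencoder(rng, x_dim=28 * 28, y_dim=10,
                                    hidden_layer_sizes=[500, 500])

learning_rate = 0.1
gparams = T.grad(model.finetune_cost, model.params)
updates = [(p, p - learning_rate * g) for p, g in zip(model.params, gparams)]

# x_batch: float matrix (n_examples, 784); y_batch: int32 vector of labels.
finetune_step = theano.function(inputs=[model.x, model.y],
                                outputs=model.finetune_cost,
                                updates=updates,
                                allow_input_downcast=True)
# -------------------------------------------------------------------------------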
    rng,
    input=layer0.output,
    image_shape=(batch_size, nkerns[0], 8, 8),
    filter_shape=(nkerns[1], nkerns[0], 4, 4),
    poolsize=(1, 1),
    stride=(1, 1),
    W=params[4].get_value(),
    b=params[5].get_value(),
)

layer2_input = layer1.output.flatten(2)

layer2 = HiddenLayer(
    rng,
    inputs=layer2_input,
    n_in=nkerns[1] * 5 * 5,
    n_out=500,
    activation=T.tanh,
    W=params[2].get_value(),
    b=params[3].get_value(),
)

layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=36,
                            W=params[0].get_value(), b=params[1].get_value())

forward = theano.function(
    inputs=[input],
    outputs=layer3.p_y_given_x,
    on_unused_input='warn',
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=100, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options # emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/' emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/' test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_noMT_epoch4.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_labels, word2id = load_il10_NI_test( word2id, maxSentLen) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) ''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) test_labels = np.asarray(test_labels, dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} # word2vec=load_fasttext_multiple_word2vec_given_file([emb_root+'100k-ENG-multicca.300.ENG.txt',emb_root+'100k-HIN-multicca.d300.HIN.txt',emb_root+'100k-IL10-multicca.d300.IL10.txt'], 300) word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + '100k-IL10-cca.d100.eng.txt', emb_root + '100k-IL10-cca.d100.IL10.txt' ], 100) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') 
#batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings 
= attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), 
T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + 
os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
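
# --- Hedged illustration (pure NumPy, not the project's average_f1_two_array_by_col)
# The test loop above binarizes the sigmoid scores with the 0.3 threshold and then
# evaluates mean per-type F1 over the 0/1 label matrix. One simple way such a
# per-column F1 could be computed is sketched below; all data here is fake.
import numpy as np

def per_column_mean_f1(pred, gold):
    # pred, gold: int arrays of shape (n_examples, n_types) with values in {0, 1}
    f1s = []
    for col in range(gold.shape[1]):
        tp = np.sum((pred[:, col] == 1) & (gold[:, col] == 1))
        fp = np.sum((pred[:, col] == 1) & (gold[:, col] == 0))
        fn = np.sum((pred[:, col] == 0) & (gold[:, col] == 1))
        precision = tp / float(tp + fp) if tp + fp > 0 else 0.0
        recall = tp / float(tp + fn) if tp + fn > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
        f1s.append(f1)
    return np.mean(f1s)

scores = np.random.RandomState(0).rand(6, 12)            # fake sigmoid scores, batch * 12 types
pred = (scores > 0.3).astype('int32')                    # same threshold as above
gold = (np.random.RandomState(1).rand(6, 12) > 0.7).astype('int32')
print 'mean per-type F1:', per_column_mean_f1(pred, gold)
# -------------------------------------------------------------------------------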
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng,PV, kind =2,theano_rng=None, n_ins=784,h_activation = [], hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.PV = theano.shared(value=PV,borrow=True) self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels self.z1 = T.matrix('z1') self.z2 = T.matrix('z2') # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output activation = None if h_activation[i] == 1: activation = T.nnet.sigmoid if h_activation[i] == 2: activation = T.tanh sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a output layer on top of the MLP self.OutLayer = HiddenLayer(rng=numpy_rng, input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs,activation=T.nnet.sigmoid, kind=2) self.params.extend(self.OutLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.OutLayer.sq_loss(self.z1,self.z2) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.OutLayer.errors(self.y) self.p_y_given_x = self.OutLayer.output def pretraining_functions(self, train_set_x, batch_size, k): '''Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. 
# TODO: change cost function to reconstruction error cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={ self.x: train_set_x[batch_begin:batch_end] } ) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, true_out, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] true_out = theano.shared(value=true_out,borrow=True) assert self.PV.get_value().shape[0] == train_set_x.get_value().shape[0] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) train_fn = theano.function( inputs=[index], outputs=[self.finetune_cost,self.p_y_given_x], updates=updates, givens={ self.x: train_set_x[ index * batch_size: (index + 1) * batch_size ], self.z1: self.PV[index * batch_size: (index + 1) * batch_size], self.z2: true_out[index * batch_size: (index + 1) * batch_size] } ) test_score_i = theano.function( [index], [self.errors,self.p_y_given_x], givens={ self.x: test_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: test_set_y[ index * batch_size: (index + 1) * batch_size ] } ) valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: valid_set_y[ index * batch_size: (index + 1) * batch_size ] } ) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): score_i = [] p_y_given_x = [] for i in xrange(n_test_batches): tem = test_score_i(i) score_i.append(tem[0]) p_y_given_x.append(tem[1]) return [score_i,p_y_given_x] return train_fn, valid_score, test_score
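
# --- Hedged usage sketch for the DBN class above (not from the original file) ---
# It assumes train_set_x, datasets, PV and true_out are prepared elsewhere:
# train_set_x a Theano shared matrix, datasets the usual (train, valid, test)
# pairs, PV and true_out the NumPy arrays the constructor and finetuning expect.
import numpy

numpy_rng = numpy.random.RandomState(123)
dbn = DBN(numpy_rng, PV, n_ins=784, h_activation=[1, 1],
          hidden_layers_sizes=[500, 500], n_outs=10)

# Greedy layer-wise CD-k pretraining of each RBM.
pretrain_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                         batch_size=20, k=1)
n_batches = train_set_x.get_value(borrow=True).shape[0] // 20
for layer_idx, fn in enumerate(pretrain_fns):
    for epoch in range(10):
        costs = [fn(index=i, lr=0.01) for i in range(n_batches)]
        print 'layer %d, epoch %d, cost %f' % (layer_idx, epoch, numpy.mean(costs))

# Supervised finetuning with the squared-loss output layer.
train_fn, valid_score, test_score = dbn.build_finetune_functions(
    true_out, datasets, batch_size=20, learning_rate=0.1)
# -------------------------------------------------------------------------------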
class BRNNMatchScorer(object):
    '''
    Bidirectional RNN for text matching as a classification problem.
    '''
    def __init__(self, config, verbose=True):
        # Construct two BRNNEncoders for matching two sentences
        self.encoderL = BRNNEncoder(config, verbose)
        self.encoderR = BRNNEncoder(config, verbose)
        # Link two parts
        self.params = []
        self.params += self.encoderL.params
        self.params += self.encoderR.params
        # Set up inputs.
        # Note that there are three kinds of inputs altogether:
        # 1. inputL, inputR: used for computing the score after training.
        # 2. inputPL, inputPR: used for training on positive pairs.
        # 3. inputNL, inputNR: used for training on negative pairs.
        self.inputL = self.encoderL.input
        self.inputR = self.encoderR.input
        # Positive
        self.inputPL = T.matrix(name='inputPL', dtype=floatX)
        self.inputPR = T.matrix(name='inputPR', dtype=floatX)
        # Negative
        self.inputNL = T.matrix(name='inputNL', dtype=floatX)
        self.inputNR = T.matrix(name='inputNR', dtype=floatX)
        # Get output of the two BRNNEncoders
        self.hiddenL = self.encoderL.output
        self.hiddenR = self.encoderR.output
        # Positive hidden representations
        self.hiddenPL = self.encoderL.encode(self.inputPL)
        self.hiddenPR = self.encoderR.encode(self.inputPR)
        # Negative hidden representations
        self.hiddenNL = self.encoderL.encode(self.inputNL)
        self.hiddenNR = self.encoderR.encode(self.inputNR)
        # Activation function
        self.act = Activation(config.activation)
        self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=0)
        self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=0)
        self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=0)
        # Build hidden layer
        self.hidden_layer = HiddenLayer(self.hidden,
                                        (4 * config.num_hidden, config.num_mlp),
                                        act=Activation(config.hiddenact))
        self.compressed_hidden = self.hidden_layer.output
        self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP)
        self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN)
        # Accumulate parameters
        self.params += self.hidden_layer.params
        # Dropout parameter
        srng = T.shared_randomstreams.RandomStreams(config.random_seed)
        mask = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hidden.shape)
        maskP = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenP.shape)
        maskN = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenN.shape)
        self.compressed_hidden *= T.cast(mask, floatX)
        self.compressed_hiddenP *= T.cast(maskP, floatX)
        self.compressed_hiddenN *= T.cast(maskN, floatX)
        # Score layer
        self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp)
        self.output = self.score_layer.output
        self.scoreP = self.score_layer.encode(self.compressed_hiddenP)
        self.scoreN = self.score_layer.encode(self.compressed_hiddenN)
        # Accumulate parameters
        self.params += self.score_layer.params
        # Build cost function: margin ranking (hinge) loss
        self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP),
                                     1.0 - self.scoreP + self.scoreN))
        # Construct the gradient of the cost function with respect to the model parameters
        self.gradparams = T.grad(self.cost, self.params)
        # Compute the total number of parameters in the model. The hidden layer
        # takes the concatenated (4 * num_hidden)-dimensional representation as input.
        self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params
        self.num_params_classifier = 4 * config.num_hidden * config.num_mlp + config.num_mlp + \
                                     config.num_mlp + 1
        self.num_params = self.num_params_encoder + self.num_params_classifier
        # Build class functions
        self.score = theano.function(inputs=[self.inputL, self.inputR],
                                     outputs=self.output)
        # Compute the gradients, the cost and the predicted scores
        self.compute_cost_and_gradient = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN])
        # Output functions for debugging purposes
        self.show_scores = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.scoreP, self.scoreN])
        self.show_hiddens = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.hiddenP, self.hiddenN])
        if verbose:
            logger.debug('Finished building the architecture of BRNNMatchScorer; summary below:')
            logger.debug('Input dimension: %d' % config.num_input)
            logger.debug('Hidden dimension of RNN: %d' % config.num_hidden)
            logger.debug('Hidden dimension of MLP: %d' % config.num_mlp)
            logger.debug('There are 2 BRNNEncoders used in the model.')
            logger.debug('Total number of parameters in this model: %d' % self.num_params)

    def update_params(self, grads, learn_rate):
        '''
        @grads: [np.ndarray]. List of numpy.ndarrays used to update the model
                parameters; they are the corresponding gradients of the model parameters.
        @learn_rate: scalar. Learning rate.
        '''
        for param, grad in zip(self.params, grads):
            p = param.get_value(borrow=True)
            param.set_value(p - learn_rate * grad, borrow=True)

    def set_params(self, params):
        '''
        @params: [np.ndarray]. List of numpy.ndarrays to set the model parameters.
        '''
        for p, param in zip(self.params, params):
            p.set_value(param, borrow=True)

    def deepcopy(self, brnn):
        '''
        @brnn: BRNNMatchScorer. Copy the model parameters of another BRNNMatchScorer.
        '''
        assert len(self.params) == len(brnn.params)
        for p, param in zip(self.params, brnn.params):
            val = param.get_value()
            p.set_value(val)

    @staticmethod
    def save(fname, model):
        '''
        @fname: String. Filename to store the model.
        @model: BRNNMatchScorer. An instance of BRNNMatchScorer to be saved.
        '''
        with file(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
        @fname: String. Filename to load the model.
        '''
        with file(fname, 'rb') as fin:
            model = cPickle.load(fin)
        return model
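
# --- Hedged usage sketch (not part of the original file) ----------------------
# One ranking update for the BRNNMatchScorer above. The config object and the
# four float32 sentence matrices seqPL, seqPR, seqNL, seqNR (assumed here to be
# time steps by config.num_input features, as the encoders expect) are prepared
# elsewhere.
scorer = BRNNMatchScorer(config)

res = scorer.compute_cost_and_gradient(seqPL, seqPR, seqNL, seqNR)
grads, (cost, scoreP, scoreN) = res[:-3], res[-3:]
print 'cost = %f, score(pos) = %s, score(neg) = %s' % (cost, scoreP, scoreN)

# Vanilla SGD step using the helper defined on the class.
scorer.update_params(grads, learn_rate=1e-2)
# -------------------------------------------------------------------------------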
def buildLayers(layer0_input, batch_size, dim, nkerns, rng, TT=None):
    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (128-5+1, 128-5+1) = (124, 124)
    # maxpooling reduces this further to (124/2, 124/2) = (62, 62)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 62, 62)
    W0 = None
    b0 = None
    W1 = None
    b1 = None
    W2 = None
    b2 = None
    W3 = None
    b3 = None
    W4 = None
    b4 = None
    W5 = None
    b5 = None
    if TT is not None:
        W0 = TT.Layer0_param.W.get_value(borrow=True)
        b0 = TT.Layer0_param.b.get_value(borrow=True)
        W1 = TT.Layer1_param.W.get_value(borrow=True)
        b1 = TT.Layer1_param.b.get_value(borrow=True)
        W2 = TT.Layer2_param.W.get_value(borrow=True)
        b2 = TT.Layer2_param.b.get_value(borrow=True)
        W3 = TT.Layer3_param.W.get_value(borrow=True)
        b3 = TT.Layer3_param.b.get_value(borrow=True)
        W4 = TT.Layer4_param.W.get_value(borrow=True)
        b4 = TT.Layer4_param.b.get_value(borrow=True)
        W5 = TT.Layer5_param.W.get_value(borrow=True)
        b5 = TT.Layer5_param.b.get_value(borrow=True)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, dim, 128, 128),
                                filter_shape=(nkerns[0], dim, 5, 5),
                                poolsize=(2, 2), Wi=W0, bi=b0)

    # Construct the second convolutional pooling layer:
    # filtering reduces the image size to (62-5+1, 62-5+1) = (58, 58)
    # maxpooling reduces this further to (58/2, 58/2) = (29, 29)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 29, 29)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 62, 62),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2), Wi=W1, bi=b1)

    # Third convolutional pooling layer: (29-6+1)/2 = 12, so the output is 12*12
    layer2 = LeNetConvPoolLayer(rng, input=layer1.output,
                                image_shape=(batch_size, nkerns[1], 29, 29),
                                filter_shape=(nkerns[2], nkerns[1], 6, 6),
                                poolsize=(2, 2), Wi=W2, bi=b2)

    # The HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[2] * 12 * 12).
    layer3_input = layer2.output.flatten(2)

    # construct the fully-connected sigmoidal layers
    layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 12 * 12,
                         n_out=1024, Wi=W3, bi=b3)
    layer4 = HiddenLayer(rng, input=layer3.output, n_in=1024, n_out=2048,
                         Wi=W4, bi=b4)
    # classify the values of the fully-connected sigmoidal layer
    layer5 = HiddenLayer(rng, input=layer4.output, n_in=2048, n_out=51,
                         Wi=W5, bi=b5)
    return [layer0, layer1, layer2, layer3, layer4, layer5]
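
# --- Hedged usage sketch (not in the original file) ---------------------------
# Wiring buildLayers into a compiled forward pass. batch_size, dim and nkerns are
# illustrative values only; the layer classes come from the surrounding project.
import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(23455)
batch_size, dim, nkerns = 20, 3, [32, 64, 96]

x = T.tensor4('x')                        # (batch_size, dim, 128, 128) images
layers = buildLayers(x, batch_size, dim, nkerns, rng)

forward = theano.function([x], layers[-1].output)
fake_batch = numpy.zeros((batch_size, dim, 128, 128), dtype=theano.config.floatX)
print forward(fake_batch).shape           # expected: (batch_size, 51)
# -------------------------------------------------------------------------------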
def __init__(
    self,
    numpy_rng,
    theano_rng=None,
    n_ins=784,
    hidden_layers_sizes=[500, 500],
    n_outs=10,
    corruption_levels=[0.1, 0.1],
    name_appendage=''
):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the sdA

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: amount of corruption to use for each layer
    """
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels
    # end-snippet-1

    # The SdA is an MLP, for which all weights of intermediate layers
    # are shared with a different denoising autoencoder.
    # We will first construct the SdA as a deep multilayer perceptron,
    # and when constructing each sigmoidal layer we also construct a
    # denoising autoencoder that shares weights with that layer.
    # During pretraining we will train these autoencoders (which will
    # lead to changing the weights of the MLP as well).
    # During finetuning we will finish training the SdA by doing
    # stochastic gradient descent on the MLP.
    # start-snippet-2
    for i in xrange(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden units of
        # the layer below or the input size if we are on the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        # the input to this layer is either the activation of the hidden
        # layer below or the input of the SdA if you are on the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid,
                                    name_appendage=name_appendage + '_sigmoid_' + str(i))
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        # it's arguably a philosophical question...
        # but we are going to only declare that the parameters of the
        # sigmoid_layers are parameters of the StackedDAA;
        # the visible biases in the dA are parameters of those dA,
        # but not of the SdA
        self.params.extend(sigmoid_layer.params)

        # Construct a denoising autoencoder that shares weights with this layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b,
                      name_appendage=name_appendage + '_dA_' + str(i))
        self.dA_layers.append(dA_layer)
    # end-snippet-2
    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs
    )
    self.params.extend(self.logLayer.params)

    # construct a function that implements one step of finetuning:
    # compute the cost for the second phase of training,
    # defined as the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # compute the gradients with respect to the model parameters;
    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
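
# --- Hedged illustration (self-contained, not from the original file) ----------
# The dA constructed above receives the sigmoid layer's W and b, so both objects
# hold the *same* Theano shared variables and pretraining the dA updates the MLP
# weights in place. A minimal demonstration of that sharing with a bare shared
# variable:
import numpy
import theano

W = theano.shared(numpy.zeros((4, 3), dtype=theano.config.floatX), name='W')
mlp_W = W        # stands in for HiddenLayer.W
dae_W = W        # stands in for dA.W -- same object, not a copy

# a "pretraining" update on the autoencoder side
dae_W.set_value((dae_W.get_value() + 1.0).astype(theano.config.floatX))
print numpy.allclose(mlp_W.get_value(), 1.0)      # True: the MLP sees the change
# -------------------------------------------------------------------------------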
def test_SdA_regress(finetune_lr=0.05, pretraining_epochs=10, pretrain_lr=0.1, training_epochs=10000, dataset='mnist.pkl.gz', batch_size=20): datasets = load_data_half(dataset) train_set_x, train_set_y = datasets[0]## valid_set_x, valid_set_y = datasets[1]## test_set_x, test_set_y = datasets[2]## train_set_x=train_set_x.eval() train_set_y=train_set_y.eval() import theano train_set_x_lab=train_set_x[:,:] train_set_x_unlab=train_set_x[:,:] train_set_y_lab=train_set_y[:,:] train_set_y_unlab=train_set_y[:,:] train_set_x_lab=theano.shared(numpy.asarray(train_set_x_lab, dtype=theano.config.floatX), borrow=True) train_set_y_lab=theano.shared(numpy.asarray(train_set_y_lab, dtype=theano.config.floatX), borrow=True) train_set_x_unlab=theano.shared(numpy.asarray(train_set_x_unlab, dtype=theano.config.floatX), borrow=True) train_set_y_unlab=theano.shared(numpy.asarray(train_set_y_unlab, dtype=theano.config.floatX), borrow=True) # compute number of minibatches for training, validation and testing n_train_batches_l = train_set_y_lab.eval().shape[0] n_train_batches_l /= batch_size n_train_batches_u = train_set_y_unlab.eval().shape[0] n_train_batches_u /= batch_size # compute number of minibatches for training, validation and testing #n_train_batches = train_set_x.get_value(borrow=True).shape[0] #n_train_batches /= batch_size # numpy random generator # start-snippet-3 numpy_rng = numpy.random.RandomState(89677) print '... building the model' # construct the stacked denoising autoencoder class #from SdA_orig import SdA as SdA_old hidden_layer_size = 100 SdA_inp = SdA(numpy_rng, n_ins=392, hidden_layers_sizes=[hidden_layer_size] ) SdA_out = SdA(numpy_rng, n_ins=392, hidden_layers_sizes=[hidden_layer_size] ) # PRETRAINING THE MODEL # if 0 : # pretrain inp ae print '... getting the pretraining functions for INPUT AE' pretraining_fns = SdA_inp.pretraining_functions(train_set_x=train_set_x_unlab, batch_size=batch_size) print '... pre-training the model' start_time = time.clock() ## Pre-train layer-wise corruption_levels = [.1, .2, .3] for i in xrange(SdA_inp.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches_u): c.append(pretraining_fns[i](index=batch_index, corruption=corruption_levels[i], lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if 0 : # pretrain out ae print '... getting the pretraining functions for OUTPUT AE' pretraining_fns = SdA_out.pretraining_functions(train_set_x=train_set_y_unlab, batch_size=batch_size) print '... 
pre-training the model' start_time = time.clock() ## Pre-train layer-wise corruption_levels = [.5, .2, .3] for i in xrange(SdA_out.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches_u): c.append(pretraining_fns[i](index=batch_index, corruption=corruption_levels[i], lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) if 0: # save aes f=open('aes_shallow_sig_nobias.pkl', 'w+') import pickle pickle.dump(SdA_inp, f) pickle.dump(SdA_out, f) f.flush() f.close() if 0: # load aes f=open('aes_shallow_sig_nobias.pkl', 'r') import pickle SdA_inp=pickle.load(f) SdA_out=pickle.load(f) f.close() if 1: # cca from dcca_numpy import netCCA_nobias, netCCA, dCCA from mlp_numpy import expit, logistic_prime, linear, linear_prime, relu, relu_prime, tanh, tanh_prime train_y1 = train_set_x_lab.eval() train_y2 = train_set_y_lab.eval() test_y1 = test_set_x.eval() test_y2 = test_set_y.eval() ##param1=((train_y1.shape[1],0,0),(2038, relu, relu_prime),(50, relu, relu_prime)) ##param2=((train_y2.shape[1],0,0),(1608, relu, relu_prime),(50, relu, relu_prime)) param1=((train_y1.shape[1],0,0),(hidden_layer_size, expit, logistic_prime)) param2=((train_y2.shape[1],0,0),(hidden_layer_size, expit, logistic_prime)) W1s = [] b1s = [] for i in range(len(SdA_inp.dA_layers)): W1s.append( SdA_inp.dA_layers[i].W.T.eval() ) ##b1s.append( SdA_inp.dA_layers[i].b.eval() ) ##b1s[-1] = b1s[-1].reshape((b1s[-1].shape[0], 1)) W2s = [] b2s = [] for i in range(len(SdA_out.dA_layers)): W2s.append( SdA_out.dA_layers[i].W.T.eval() ) ##b2s.append( SdA_out.dA_layers[i].b.eval() ) ##b2s[-1] = b2s[-1].reshape((b2s[-1].shape[0], 1)) numpy.random.seed(0) N1=netCCA_nobias(train_y1,param1, W1s) N2=netCCA_nobias(train_y2,param2, W2s) N = dCCA(train_y1, train_y2, N1, N2) N1.reconstruct(test_set_x.eval()[0,:]) cnt = 0 from dcca_numpy import cca_cost, cca, order_cost, cor_cost while True: X=N1.predict(test_set_x.eval()) Y=N2.predict(test_set_y.eval()) _H1 = numpy.dot(X, N.A1) _H2 = numpy.dot(Y, N.A2) print '****', cnt, cor_cost(_H1, _H2) X1_rec = numpy.tanh(X.dot(N1.weights[0])) X2_rec = numpy.tanh(Y.dot(N2.weights[0])) param=((hidden_layer_size,0,0),(hidden_layer_size, relu, relu_prime)) from mlp_numpy import NeuralNetwork as NN lr=NN(X,Y,param) lr.train(X[:,:],Y[:,:],10, 0.005) Yh=lr.predict(X[:,:]) X2_reg = N2.fs[-1](numpy.dot(Yh,N2.weights[0])) #X2_reg = N2.fs[-1](numpy.dot(_H1.dot(numpy.linalg.inv(N.A1)),N2.weights[0])) print '****', 'mse1:', numpy.mean((X1_rec-test_set_x.eval())**2.0) print '****', 'mse2:', numpy.mean((X2_rec-test_set_y.eval())**2.0) print '****', 'mse_map:', numpy.mean((X2_reg-test_set_y.eval())**2.0) if cnt % 2: N.train(5, True, 10000.0) else: N.train(5, False, 10000.0) cnt += 1 f=open('netcca.pkl', 'w+') import pickle pickle.dump(N, f) pickle.dump(N, f) f.flush() f.close() if cnt == 200: break for i in range(len(SdA_inp.dA_layers)): SdA_inp.dA_layers[i].W = theano.shared( N1.weights[i].T ) SdA_inp.dA_layers[i].b = theano.shared( N1.biases[i][:,0] ) for i in range(len(SdA_out.dA_layers)): SdA_out.dA_layers[i].W = theano.shared( N2.weights[i].T ) SdA_out.dA_layers[i].b = theano.shared( N2.weights[i][:,0] ) if 1 : # pretrain middle layer print '... 
pre-training MIDDLE layer' h1 = T.matrix('x') # the data is presented as rasterized images h2 = T.matrix('y') # the labels are presented as 1D vector of log_reg = HiddenLayer(numpy_rng, h1, hidden_layer_size, hidden_layer_size) if 1: # for middle layer learning_rate = 0.01 fprop_inp = theano.function( [], SdA_inp.sigmoid_layers[-1].output, givens={ SdA_inp.sigmoid_layers[0].input: train_set_x_lab }, name='fprop_inp' ) fprop_out = theano.function( [], SdA_out.sigmoid_layers[-1].output, givens={ SdA_out.sigmoid_layers[0].input: train_set_y_lab }, name='fprop_out' ) #H11=fprop_inp() #H21=fprop_out() ##H1=N1.predict(train_set_x.eval()) ##H2=N2.predict(train_set_y.eval()) H1=fprop_inp() H2=fprop_out() H1=theano.shared(H1) H2=theano.shared(H2) # compute the gradients with respect to the model parameters logreg_cost = log_reg.mse(h2) gparams = T.grad(logreg_cost, log_reg.params) # compute list of fine-tuning updates updates = [ (param, param - gparam * learning_rate) for param, gparam in zip(log_reg.params, gparams) ] train_fn_middle = theano.function( inputs=[], outputs=logreg_cost, updates=updates, givens={ h1: H1, h2: H2 }, name='train_middle' ) epoch = 0 while epoch < 10: print epoch, train_fn_middle() epoch += 1 sda = SdA_regress( SdA_inp, SdA_out, log_reg, numpy_rng=numpy_rng, n_inp=28*28//2, hidden_layers_sizes_inp=[hidden_layer_size], hidden_layers_sizes_out=[hidden_layer_size], n_out=28*28//2 ) # end-snippet-3 start-snippet-4 # end-snippet-4 # FINETUNING THE MODEL # # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, validate_model, test_model = sda.build_finetune_functions( datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr ) print '... finetunning the model' # early-stopping parameters patience = 10 * n_train_batches_l # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches_l, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() done_looping = False epoch = 0 fprop = theano.function( [], sda.sigmoid_layers[-1].output, givens={ sda.x: test_set_x }, name='fprop' ) while True: epoch = epoch + 1 for minibatch_index in xrange(n_train_batches_l): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches_l + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches_l, this_validation_loss )) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if ( this_validation_loss < best_validation_loss * improvement_threshold ): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = test_model() test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches_l, test_score )) if patience <= iter: done_looping = True #break if 0: # vis weights fprop = theano.function( [], sda.sigmoid_layers[-1].output, givens={ sda.x: test_set_x }, name='fprop' ) yh=fprop() yh=yh end_time = time.clock() print( ( 'Optimization complete with best validation score of %f %%, ' 'on iteration %i, ' 'with test performance %f %%' ) % (best_validation_loss , best_iter + 1, test_score) ) print >> sys.stderr, ('The training code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
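
# --- Hedged illustration (pure NumPy, not the project's cor_cost) --------------
# The deep-CCA loop above tracks a correlation measure between the two projected
# views _H1 and _H2. Summing per-dimension Pearson correlations is one simple
# variant of such a measure; the data below is synthetic.
import numpy as np

def total_correlation(H1, H2):
    # H1, H2: (n_examples, k) projections of the two views
    H1c = H1 - H1.mean(axis=0)
    H2c = H2 - H2.mean(axis=0)
    num = (H1c * H2c).sum(axis=0)
    den = np.sqrt((H1c ** 2).sum(axis=0) * (H2c ** 2).sum(axis=0)) + 1e-12
    return (num / den).sum()

_rng = np.random.RandomState(0)
H1 = _rng.randn(100, 5)
H2 = H1 + 0.1 * _rng.randn(100, 5)       # strongly correlated views
print 'total correlation:', total_correlation(H1, H2)   # close to 5.0
# -------------------------------------------------------------------------------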
def classify_lenet5(batch_size=500, output_size=20): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 37, 23)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 37, 23), filter_shape=(20, 1, 4, 2), poolsize=(2, 2), ) # layer1 = LeNetConvPoolLayer( # rng, # input=layer0.output, # image_shape=(batch_size, 20, 17, 11), # filter_shape=(50, 20, 4, 2), # poolsize=(2, 2), # ) # # layer4 = LeNetConvPoolLayer( # rng, # input=layer1.output, # image_shape=(batch_size, 50, 7, 5), # filter_shape=(100, 50, 4, 2), # poolsize=(2, 2), # ) layer2_input = layer0.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=3740, n_out=output_size, activation=T.tanh, use_bias=True ) # layer5 = HiddenLayer( # rng, # input=layer2.output, # n_in=200, # n_out=output_size, # activation=T.tanh, # use_bias=True # ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=output_size, n_out=2) model_params = pickle.load(open('../model/cnn_dist_'+str(output_size)+'.pkl')) # layer0.W = theano.shared( value=numpy.array( model_params[2].get_value(True), dtype=theano.config.floatX ), name='W', borrow=True ) layer0.b = theano.shared( value=numpy.array( model_params[3].get_value(True), dtype=theano.config.floatX ), name='b', borrow=True ) # layer1.W = theano.shared( # value=numpy.array( # model_params[-4].get_value(True), # dtype=theano.config.floatX # ), # name='W', # borrow=True # ) # # layer1.b = theano.shared( # value=numpy.array( # model_params[-3].get_value(True), # dtype=theano.config.floatX # ), # name='b', # borrow=True # ) # # layer4.W = theano.shared( # value=numpy.array( # model_params[-6].get_value(True), # dtype=theano.config.floatX # ), # name='W', # borrow=True # ) # # layer4.b = theano.shared( # value=numpy.array( # model_params[-5].get_value(True), # dtype=theano.config.floatX # ), # name='b', # borrow=True # ) layer2.W = theano.shared( value=numpy.array( model_params[0].get_value(True), dtype=theano.config.floatX ), name='W', borrow=True ) layer2.b = theano.shared( value=numpy.array( model_params[1].get_value(True), dtype=theano.config.floatX ), name='b', borrow=True ) # layer5.W = theano.shared( # value=numpy.array( # model_params[-10].get_value(True), # dtype=theano.config.floatX # ), # name='W', # borrow=True # ) # # layer5.b = theano.shared( # value=numpy.array( # model_params[-9].get_value(True), # dtype=theano.config.floatX # ), # name='b', # 
borrow=True # ) layer3.W = theano.shared( value=numpy.array( model_params[4].get_value(True), dtype=theano.config.floatX ), name='W', borrow=True ) layer3.b = theano.shared( value=numpy.array( model_params[5].get_value(True), dtype=theano.config.floatX ), name='b', borrow=True ) # params = layer3.params + layer5.params + layer2.params + layer4.params + layer1.params + layer0.params datasets = load_data(None) sets = ['train', 'dev', 'test'] dimension = [20000, 20000, 20000] for k in range(3): if k == 0: classify_set_x, classify_set_y, classify_set_z, classify_set_m, classify_set_c, classify_set_b= datasets[k] else: classify_set_x, classify_set_y, classify_set_z= datasets[k] # compute number of minibatches for training, validation and testing n_classify_batches = classify_set_x.get_value(borrow=True).shape[0] n_classify_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch classify = theano.function( [index], layer2.output, givens={ x: classify_set_x[index * batch_size: (index + 1) * batch_size], } ) r = [] for i in xrange(n_classify_batches): m = classify(i) r.extend(m) r = np.array(r) print r.shape r = np.append(r, np.reshape(classify_set_y.eval(),(dimension[k], 1)), 1) numpy.savetxt('../extractedInformation/cnn_dist_'+str(output_size)+'/'+sets[k]+'.csv', r, delimiter=",")
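
# --- Hedged note (not from the original file) ----------------------------------
# The unpickled model_params above are consumed as
# [layer2.W, layer2.b, layer0.W, layer0.b, layer3.W, layer3.b], so whatever
# training script produces '../model/cnn_dist_<output_size>.pkl' must dump the
# shared variables in that same order. A hypothetical helper (save_cnn_dist is
# not part of the original code) could look like this:
import pickle

def save_cnn_dist(path, layer0, layer2, layer3):
    params = [layer2.W, layer2.b, layer0.W, layer0.b, layer3.W, layer3.b]
    with open(path, 'wb') as fout:
        pickle.dump(params, fout)
# -------------------------------------------------------------------------------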
def evaluate_lenet5(learning_rate=0.05, n_epochs=10, nkerns=[20, 50], batch_size=50): global train_dataset_route global valid_dataset_route global train_limit global valid_limit print train_dataset_route, type(train_dataset_route) """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data.load_spc_data(train_dataset_route, valid_dataset_route, train_limit, valid_limit) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (100, 100) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 100, 100)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 100, 100), filter_shape=(nkerns[0], 1, 40, 40), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 30, 30), filter_shape=(nkerns[1], nkerns[0], 15, 15), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 8 * 8, n_out=100, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=100, n_out=2) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model ''' test_model = theano.function([index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) test_results = theano.function(inputs=[index], outputs= layer3.y_pred, givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) ''' validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function([index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter , ' patience = ' , patience cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter layer0_state = layer0.__getstate__() layer1_state = layer1.__getstate__() layer2_state = layer2.__getstate__() layer3_state = layer3.__getstate__() trained_model_list = [layer0_state, layer1_state, layer2_state, layer3_state] trained_model_array = numpy.asarray(trained_model_list) classifier_file = open(train_model_route, 'w') cPickle.dump([1,2,3], classifier_file, protocol=2) numpy.save(classifier_file, trained_model_array) classifier_file.close() if patience <= iter: done_looping = True print patience , iter break end_time = time.clock() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
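The early-stopping block in the training loop above interleaves the patience bookkeeping with model saving. A small standalone restatement of just the patience rule (same quantities as patience, patience_increase and improvement_threshold above; no Theano involved) may make the control flow easier to follow:

def early_stopping_step(this_loss, best_loss, iteration, patience,
                        patience_increase=2, improvement_threshold=0.995):
    """Return (best_loss, patience, improved, stop) after one validation check."""
    improved = this_loss < best_loss
    if improved:
        # a sufficiently large relative improvement extends the patience window
        if this_loss < best_loss * improvement_threshold:
            patience = max(patience, iteration * patience_increase)
        best_loss = this_loss
    stop = patience <= iteration
    return best_loss, patience, improved, stop

# the loop above effectively applies this rule once every validation_frequency iterations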
def evaluate_lenet5(learning_rate=0.005, n_epochs=5,data = None,nkerns= 64, batch_size=30): #for i in range(len(x_val)): #if len(x_val[i]) == 490 and len(x_val[i][0]) == 640: #x1.append(x_val[i]) #y1.append(y_val[i]-1) #if len(x1) == 80: #break from data_loader import load_data train, validate, test = load_data() x_train = np.array(train[0],'float32') y_train = train[1] x_valid = np.array(validate[0],'float32') y_valid = validate[1] x_test = np.array(test[0],'float32') y_test = test[1] x_train2 = theano.shared(numpy.asarray(x_train,dtype=theano.config.floatX)) y_train_2 = theano.shared(numpy.asarray(y_train,dtype=theano.config.floatX)) x_valid2 = theano.shared(numpy.asarray(x_valid,dtype=theano.config.floatX)) y_valid_2 = theano.shared(numpy.asarray(y_valid,dtype=theano.config.floatX)) x_test2 = theano.shared(numpy.asarray(x_test,dtype=theano.config.floatX)) y_test_2 = theano.shared(numpy.asarray(y_test,dtype=theano.config.floatX)) y_train2 = T.cast(y_train_2, 'int32') y_test2 = T.cast(y_test_2, 'int32') y_valid2 = T.cast(y_valid_2, 'int32') print len(x_train) print len(y_train) rng = numpy.random.RandomState(23455) n_train_batches = len(y_train)/batch_size n_valid_batches = len(y_valid)/batch_size n_test_batches = len(y_test)/batch_size index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as a 1D vector of [int] labels layer0_input = x.reshape((batch_size, 1, 64, 64)) '''Build the first layer: image_shape: the input is a 64*64 feature map, with batch_size training samples and 1 feature map per sample. filter_shape: nkerns=64 convolution kernels, so this layer produces 64 feature maps per sample. Convolution shrinks the image to (64-7+1, 64-7+1) = (58, 58); 2x2 pooling shrinks it further to (58/2, 58/2) = (29, 29). The resulting image_shape of this layer is (batch_size, nkerns, 29, 29).''' layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 64, 64), filter_shape=(nkerns, 1, 7, 7), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns * 29 * 29), # (30, 64*29*29) with the default values. layer2_input = layer0.output.flatten(2) '''Fully-connected layer: layer2_input is a 2D matrix whose first dimension indexes the samples and whose second dimension holds the features each sample obtains from the convolution and downsampling above. HiddenLayer is a single-layer network; layer2 below maps the nkerns * 29 * 29 flattened features down to 500 hidden units.''' layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns * 29 * 29, n_out=500, activation=T.tanh ) layer2.output = dropout_layer(layer2.output,0.5) # Final layer: a logistic regression classifier that maps the 500 hidden units to the 8 output classes layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=8) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ y: y_test2[index * batch_size: (index + 1) * batch_size], x: x_test2[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: x_valid2[index * batch_size: (index + 1) * batch_size], y: y_valid2[index * batch_size: (index + 1) * batch_size] } ) # put all the parameters into a single list; the per-layer parameter lists can simply be concatenated with + params = layer3.params + layer2.params + layer0.params # compute the gradients of the cost with respect to the parameters grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], cost, updates=updates, givens={ x: x_train2[index * batch_size: (index + 1) * batch_size], y: y_train2[index * batch_size: (index + 1) * batch_size] } ) print '... 
training' # early-stopping parameters patience = 10000 # look at this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.2 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): #while epoch < n_epochs: epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): # iterate over the training minibatches cost_ij = train_model(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break with open('param0.pkl', 'wb') as f0: pickle.dump(layer0.params, f0) f0.close() with open('param2.pkl', 'wb') as f2: pickle.dump(layer2.params, f2) f2.close() with open('param3.pkl', 'wb') as f3: pickle.dump(layer3.params, f3) f3.close() end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
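The call to dropout_layer(layer2.output, 0.5) in the model above relies on a helper that is not defined in this snippet. A minimal sketch of what such an inverted-dropout helper could look like with Theano's RandomStreams (an assumption, not the author's implementation):

import theano
from theano.tensor.shared_randomstreams import RandomStreams

# hypothetical helper: zero a fraction p_drop of the units and rescale the rest
_srng = RandomStreams(seed=23455)

def dropout_layer(layer_output, p_drop=0.5):
    retain_prob = 1.0 - p_drop
    mask = _srng.binomial(size=layer_output.shape, n=1, p=retain_prob,
                          dtype=theano.config.floatX)
    return layer_output * mask / retain_prob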
def __init__(self, numpy_rng,PV, kind =2,theano_rng=None, n_ins=784,h_activation = [], hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.PV = theano.shared(value=PV,borrow=True) self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels self.z1 = T.matrix('z1') self.z2 = T.matrix('z2') # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output activation = None if h_activation[i] == 1: activation = T.nnet.sigmoid if h_activation[i] == 2: activation = T.tanh sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shares weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add an output layer on top of the MLP self.OutLayer = HiddenLayer(rng=numpy_rng, input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs,activation=T.nnet.sigmoid, kind=2) self.params.extend(self.OutLayer.params) # compute the cost for the second phase of training, defined as the # squared loss of the output layer (OutLayer.sq_loss) self.finetune_cost = self.OutLayer.sq_loss(self.z1,self.z2) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.OutLayer.errors(self.y) self.p_y_given_x = self.OutLayer.output
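The finetuning cost above relies on OutLayer.sq_loss(z1, z2), which is not shown in this snippet. One plausible reading (purely an assumption, since z1 and z2 are both declared as matrices) is a squared error between the layer output and a target matrix z1, with z2 acting as an element-wise weight or mask. A minimal sketch of such a method:

import theano.tensor as T

def sq_loss(self, z1, z2):
    # hypothetical squared loss for a HiddenLayer-style output layer:
    # z1 is treated as the target matrix, z2 as an element-wise weight/mask
    return T.mean(T.sum(z2 * T.sqr(self.output - z1), axis=1))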
def run(): preProcess = PreProcess() data = preProcess.run() train_set_x,train_set_y = data[0],data[3] valid_set_x,valid_set_y = data[1],data[4]#data[1],data[4] test_set_x,test_set_y = data[2],data[5] # network parameters num_kernels = [10,10] kernel_sizes = [(9, 9), (5, 5)] #exit() # training parameters learning_rate = 0.005 batch_size = 50 n_sports = np.max(train_set_y.eval())+1 sigmoidal_output_size = 20 if valid_set_y.eval().size<batch_size: print 'Error: Batch size is larger than size of validation set.' # Setup 2: compute batch sizes for train/test/validation # borrow=True gets us the value of the variable without making a copy. n_train_batches = train_set_x.get_value(borrow=True).shape[1] n_test_batches = test_set_x.get_value(borrow=True).shape[1] n_valid_batches = valid_set_x.get_value(borrow=True).shape[1] n_train_batches /= batch_size n_test_batches /= batch_size n_valid_batches /= batch_size # Setup 3. # Declare inputs to network - x and y are placeholders # that will be used in the training/testing/validation functions below. x = T.tensor3('x') # input image data y = T.ivector('y') # input label data # ## Layer 0 - First convolutional Layer # The first layer takes **`(batch_size, 1, 28, 28)`** as input, convolves it with **10** different **9x9** filters, and then downsamples (via maxpooling) in a **2x2** region. Each filter/maxpool combination produces an output of size **`(28-9+1)/2 = 10`** on a side. # The size of the first layer's output is therefore **`(batch_size, 10, 10, 10)`**. class Convolution(object): def __init__(self,batch_size,num_kernels,kernel_sizes,channel): self.layer0_input_size = (batch_size, 1, 100, 100) # fixed size from the data self.edge0 = (100 - kernel_sizes[0][0] + 1) / 2 self.layer0_output_size = (batch_size, num_kernels[0], self.edge0, self.edge0) # check that we have an even multiple of 2 before pooling assert ((100 - kernel_sizes[0][0] + 1) % 2) == 0 # The actual input is the placeholder x reshaped to the input size of the network self.layer0_input = x[channel].reshape(self.layer0_input_size) self.layer0 = LeNetConvPoolLayer(rng, input=self.layer0_input, image_shape=self.layer0_input_size, subsample= (1,1), filter_shape=(num_kernels[0], 1) + kernel_sizes[0], poolsize=(2, 2)) # ## Layer 1 - Second convolutional Layer # The second layer takes **`(batch_size, 10, 10, 10)`** as input, convolves it with 10 different **10x5x5** filters, and then downsamples (via maxpooling) in a **2x2** region. Each filter/maxpool combination produces an output of size **`(10-5+1)/2 = 3`** on a side. # The size of the second layer's output is therefore **`(batch_size, 10, 3, 3)`**. self.layer1_input_size = self.layer0_output_size self.edge1 = (self.edge0 - kernel_sizes[1][0] + 1) / 2 self.layer1_output_size = (batch_size, num_kernels[1], self.edge1, self.edge1) # check that we have an even multiple of 2 before pooling assert ((self.edge0 - kernel_sizes[1][0] + 1) % 2) == 0 self.layer1 = LeNetConvPoolLayer(rng, input=self.layer0.output, image_shape=self.layer1_input_size, subsample= (1,1), filter_shape=(num_kernels[1], num_kernels[0]) + kernel_sizes[1], poolsize=(2, 2)) conv = Convolution(batch_size,num_kernels,kernel_sizes,0) conv2 = Convolution(batch_size,num_kernels,kernel_sizes,1) conv3 = Convolution(batch_size,num_kernels,kernel_sizes,2) # ## Layer 2 - Fully connected sigmoidal layer #exit() # The sigmoidal layer takes a vector as input. # We flatten all but the first two dimensions, to get an input of size **`(batch_size, 30 * 4 * 4)`**. 
#raw_random= raw_random.RandomStreamsBase() srng = theano.tensor.shared_randomstreams.RandomStreams( rng.randint(999999)) #def rectify(X): # return T.maximum(X,0.) def dropout(X,p=0.5): if p>0: retain_prob = 1-p X *= srng.binomial(X.shape,p=retain_prob,dtype = theano.config.floatX) X /= retain_prob return X def rectify(X): return T.maximum(X,0.) layer2_input = conv.layer1.output.flatten(2) layer2_input = T.concatenate((T.concatenate((conv.layer1.output.flatten(2),conv2.layer1.output.flatten(2)),axis=1),conv2.layer1.output.flatten(2)),axis=1) layer2 = HiddenLayer(rng, input=dropout(layer2_input), n_in= num_kernels[1] * conv.edge1 * conv.edge1*3, n_out= num_kernels[1] * conv.edge1 * conv.edge1, activation=rectify) #T.tanh # EXTRA LAYER # A fully connected logistic regression layer converts the sigmoid's layer output to a class label. extra = HiddenLayer(rng, input=dropout(layer2.output), n_in= num_kernels[1] * conv.edge1 * conv.edge1, n_out=num_kernels[1] * conv.edge1 * conv.edge1, activation=rectify) #T.tanh # ## Layer 3 - Logistic regression output layer # A fully connected logistic regression layer converts the sigmoid's layer output to a class label. layer3 = LogisticRegression(input=extra.output, n_in=num_kernels[1] * conv.edge1 * conv.edge1, n_out=n_sports) # # Training the network # To train the network, we have to define a cost function. We'll use the Negative Log Likelihood of the model, relative to the true labels **`y`**. # The cost we minimize during training is the NLL of the model. # Recall: y is a placeholder we defined above cost = layer3.negative_log_likelihood(y) # ### Gradient descent # We will train with Stochastic Gradient Descent. To do so, we need the gradient of the cost relative to the parameters of the model. We can get the parameters for each label via the **`.params`** attribute. # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + conv.layer1.params + conv.layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # ## Update updates = [ (param_i, param_i - learning_rate * grad_i) # <=== SGD update step for param_i, grad_i in zip(params, grads) ] index = T.lscalar() # index to a batch of training/validation/testing examples train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[:,index * batch_size: (index + 1) * batch_size], # <=== batching y: train_set_y[index * batch_size: (index + 1) * batch_size] # <=== batching } ) # ## Validation function # To track progress on a held-out set, we count the number of misclassified examples in the validation set. validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[:,index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # ## Test function # After training, we check the number of misclassified examples in the test set. test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[:,index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) # guesses = theano.function( # [], # layer3.y_pred, # givens={ # x: test_set_x # } # ) # # Training loop # We use SGD for a fixed number of iterations over the full training set (an "epoch"). Usually, we'd use a more complicated rule, such as iterating until a certain number of epochs fail to produce improvement in the validation set. 
for epoch in range(90): costs = [train_model(i) for i in xrange(n_train_batches)] validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] #print layer3.return_y_pred() print "Epoch {} NLL {:.2} %err in validation set {:.1%}".format(epoch + 1, np.mean(costs), np.mean(validation_losses)) # ## Learned features #filters = tile_raster_images(layer0.W.get_value(borrow=True), img_shape=(9, 9), tile_shape=(1,10), tile_spacing=(3, 3), # scale_rows_to_unit_interval=True, # output_pixel_vals=True) #plt.imshow(filters) #plt.show() # ## Check performance on the test set test_errors = [test_model(i) for i in range(n_test_batches)] print "test errors: {:.1%}".format(np.mean(test_errors))
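The size bookkeeping inside the Convolution class above (edge0, edge1 and the even-divisibility asserts) follows a single rule: a 'valid' convolution with an f x f filter followed by 2x2 max pooling maps an edge of n pixels to (n - f + 1) / 2. A small standalone helper restating that arithmetic:

def conv_pool_edge(n, f, pool=2):
    """Edge length after a 'valid' f x f convolution and pool x pool max pooling."""
    assert (n - f + 1) % pool == 0, "edge must divide evenly before pooling"
    return (n - f + 1) // pool

# with the settings above: conv_pool_edge(100, 9) == 46, then conv_pool_edge(46, 5) == 21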
def evaluate_lenet5(self): #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7], # L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1): rng = numpy.random.RandomState(23455) #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60) #datasets = load_data(dataset) indices_train, trainY, trainLengths, trainLeftPad, trainRightPad= self.datasets[0] indices_dev, devY, devLengths, devLeftPad, devRightPad= self.datasets[1] indices_test, testY, testLengths, testLeftPad, testRightPad= self.datasets[2] n_train_batches=indices_train.shape[0]/self.batch_size n_valid_batches=indices_dev.shape[0]/self.batch_size n_test_batches=indices_test.shape[0]/self.batch_size remain_train=indices_train.shape[0]%self.batch_size train_batch_start=[] dev_batch_start=[] test_batch_start=[] if self.useAllSamples: train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size)+[indices_train.shape[0]-self.batch_size] dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size)+[indices_dev.shape[0]-self.batch_size] test_batch_start=list(numpy.arange(n_test_batches)*self.batch_size)+[indices_test.shape[0]-self.batch_size] n_train_batches=n_train_batches+1 n_valid_batches=n_valid_batches+1 n_test_batches=n_test_batches+1 else: train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size) dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size) test_batch_start=list(numpy.arange(n_test_batches)*self.batch_size) indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True) indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_test_theano=theano.shared(numpy.asarray(indices_test, dtype=theano.config.floatX), borrow=True) indices_train_theano=T.cast(indices_train_theano, 'int32') indices_dev_theano=T.cast(indices_dev_theano, 'int32') indices_test_theano=T.cast(indices_test_theano, 'int32') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix('x_index') # now, x is the index matrix, must be integer #y = T.ivector('y') z = T.ivector('z') # sentence length left=T.ivector('left') right=T.ivector('right') iteration= T.lscalar() x=self.embeddings_R[x_index.flatten()].reshape((self.batch_size,self.maxSentLength, self.embedding_size)).transpose(0, 2, 1).flatten() ishape = (self.embedding_size, self.maxSentLength) # this is the size of MNIST images filter_size1=(self.embedding_size,self.filter_size[0]) filter_size2=(self.embedding_size/2,self.filter_size[1]) #poolsize1=(1, ishape[1]-filter_size1[1]+1) #????????????????????????????? poolsize1=(1, ishape[1]+filter_size1[1]-1) ''' left_after_conv=T.maximum(0,left-filter_size1[1]+1) right_after_conv=T.maximum(0, right-filter_size1[1]+1) ''' left_after_conv=left right_after_conv=right #kmax=30 # this can not be too small, like 20 #ktop=6 #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6) poolsize2=(1, self.kmax+filter_size2[1]-1) #(1,6) dynamic_lengths=T.maximum(self.ktop,z/2+1) # dynamic k-max pooling ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((self.batch_size, 1, ishape[0], ishape[1])) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) ''' layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=kmax) ''' layer0 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer0_input, image_shape=(self.batch_size, 1, ishape[0], ishape[1]), filter_shape=(self.nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=dynamic_lengths, unifiedWidth=self.kmax, left=left_after_conv, right=right_after_conv, firstLayer=True) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) ''' layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0], kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop) ''' ''' left_after_conv=T.maximum(0, layer0.leftPad-filter_size2[1]+1) right_after_conv=T.maximum(0, layer0.rightPad-filter_size2[1]+1) ''' left_after_conv=layer0.leftPad right_after_conv=layer0.rightPad dynamic_lengths=T.repeat([self.ktop],self.batch_size) # dynamic k-max pooling ''' layer1 = ConvFoldPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0]/2, kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop, left=left_after_conv, right=right_after_conv) ''' layer1 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer0.output, image_shape=(self.batch_size, self.nkerns[0], ishape[0]/2, self.kmax), filter_shape=(self.nkerns[1], self.nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=dynamic_lengths, unifiedWidth=self.ktop, left=left_after_conv, right=right_after_conv, firstLayer=False) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) #produce sentence embeddings layer2 = HiddenLayer(rng, input=layer2_input, n_in=self.nkerns[1] * (self.embedding_size/4) * self.ktop, n_out=self.sentEm_length, activation=T.tanh) context_matrix, target_matrix=self.extract_contexts_targets(indices_matrix=x_index, sentLengths=z, leftPad=left) #note that context indices might be zero embeddings h_indices=context_matrix[:, self.context_size*iteration:self.context_size*(iteration+1)] w_indices=target_matrix[:, iteration:(iteration+1)] #r_h is the concatenation of context embeddings r_h=self.embed_context(h_indices) #(batch_size, context_size*embedding_size) q_w=self.embed_target(w_indices) #q_hat: concatenate sentence embeddings and context embeddings q_hat=self.concatenate_sent_context(layer2.output, r_h) layer3 = HiddenLayer(rng, input=q_hat, n_in=self.sentEm_length+self.context_size*self.embedding_size, n_out=self.embedding_size, activation=T.tanh) self.params = layer3.params + layer2.params+layer1.params + layer0.params+[self.embeddings_R, self.embeddings_Q] self.load_model_from_file() ''' # load parameters netfile = open('/mounts/data/proj/wenpeng/CNN_LM/model_params') for para in self.params: para.set_value(cPickle.load(netfile), borrow=True) layer0.params[0].set_value(cPickle.load(netfile), borrow=True) layer0.params[1].set_value(cPickle.load(netfile), borrow=True) layer2.params[0].set_value(cPickle.load(netfile), borrow=True) layer2.params[1].set_value(cPickle.load(netfile), borrow=True) layer3.params[0].set_value(cPickle.load(netfile), borrow=True) layer3.params[1].set_value(cPickle.load(netfile), borrow=True) ''' noise_indices, p_n_noise=self.get_noise() #noise_indices=theano.printing.Print('noise_indices')(noise_indices) s_theta_data=T.sum(layer3.output * q_w, axis=1).reshape((self.batch_size,1)) + self.bias[w_indices-1] #bias[0] should be the bias of word index 1 #s_theta_data=theano.printing.Print('s_theta_data')(s_theta_data) p_n_data = self.p_n[w_indices-1] #p_n[0] indicates the probability of word indexed 1 delta_s_theta_data = s_theta_data - T.log(self.k * p_n_data) log_sigm_data = T.log(T.nnet.sigmoid(delta_s_theta_data)) #create the noise, q_noise has shape(self.batch_size, self.k, self.embedding_size ) q_noise = self.embed_noise(noise_indices) q_hat_res = layer3.output.reshape((self.batch_size, 1, self.embedding_size)) s_theta_noise = T.sum(q_hat_res * q_noise, axis=2) + self.bias[noise_indices-1] #(batch_size, k) delta_s_theta_noise = s_theta_noise - T.log(self.k * p_n_noise) # it should be matrix (batch_size, k) log_sigm_noise = T.log(1 - T.nnet.sigmoid(delta_s_theta_noise)) sum_noise_per_example =T.sum(log_sigm_noise, axis=1) #(batch_size, 1) # Calc objective function J = -T.mean(log_sigm_data) - T.mean(sum_noise_per_example) L2_reg = (layer3.W** 2).sum()+ (layer2.W** 2).sum()+ (layer1.W** 2).sum()+(layer0.W** 2).sum()+(self.embeddings_R**2).sum()+( self.embeddings_Q**2).sum() self.cost = J + self.L2_weight*L2_reg #cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function([index,iteration], [self.cost,layer2.output], givens={ x_index: indices_test_theano[index: index + self.batch_size], z: testLengths[index: index + self.batch_size], left: testLeftPad[index: index + self.batch_size], right: testRightPad[index: index + self.batch_size]}) ''' validate_model = theano.function([index,iteration], self.cost, givens={ x_index: 
indices_dev_theano[index: index + self.batch_size], z: devLengths[index: index + self.batch_size], left: devLeftPad[index: index + self.batch_size], right: devRightPad[index: index + self.batch_size]}) # create a list of all model parameters to be fit by gradient descent #self.params = layer3.params + layer2.params+layer1.params + layer0.params+[self.embeddings_R, self.embeddings_Q] #params = layer3.params + layer2.params + layer0.params+[embeddings] accumulator=[] for para_i in self.params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(self.cost, self.params) updates = [] for param_i, grad_i, acc_i in zip(self.params, grads, accumulator): acc = acc_i + T.sqr(grad_i) if param_i == self.embeddings_R or param_i == self.embeddings_Q: updates.append((param_i, T.set_subtensor((param_i - self.ini_learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(self.embedding_size))))) #AdaGrad else: updates.append((param_i, param_i - self.ini_learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,iteration], [self.cost, self.params], updates=updates, givens={ x_index: indices_train_theano[index: index + self.batch_size], z: trainLengths[index: index + self.batch_size], left: trainLeftPad[index: index + self.batch_size], right: trainRightPad[index: index + self.batch_size]}) ''' ############### # TRAIN MODEL # ############### print '... testing' start_time = time.clock() test_losses=[] i=0 for batch_start in test_batch_start: i=i+1 sys.stdout.write( "Progress :[%3f] %% complete!\r" % (i*100.0/len(test_batch_start)) ) sys.stdout.flush() #print str(i*100.0/len(test_batch_start))+'%...' total_iteration=max(self.test_lengths[batch_start: batch_start + self.batch_size]) #for test, we need the cost among all the iterations in that batch for iteration in range(total_iteration): cost_i, sentEm=test_model(batch_start, iteration) test_losses.append(cost_i) #test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print 'Test over, average test loss:'+str(test_score) ''' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(20, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False vali_loss_list=[] while (epoch < self.n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 total_iteration=max(self.train_lengths[batch_start: batch_start + self.batch_size]) # we only care the last cost within those iterations cost_of_end_batch=0.0 for iteration in range(total_iteration): cost_of_end_batch, params_of_end_batch = train_model(batch_start, iteration) #total_cost=total_cost+cost_ij #if iter ==1: # exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' cost: '+str(cost_of_end_batch)# +' error: '+str(error_ij) if iter % validation_frequency == 0: # compute zero-one loss on validation set #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] validation_losses=[] for batch_start in dev_batch_start: total_iteration=max(self.dev_lengths[batch_start: batch_start + self.batch_size]) #for validate, we need the cost among all the iterations in that batch for iteration in range(total_iteration): validation_losses.append(validate_model(batch_start, iteration)) this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation cost %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) if this_validation_loss < minimal_of_list(vali_loss_list): del vali_loss_list[:] vali_loss_list.append(this_validation_loss) #store params self.best_params=params_of_end_batch elif len(vali_loss_list)<self.vali_cost_list_length: vali_loss_list.append(this_validation_loss) if len(vali_loss_list)==self.vali_cost_list_length: self.store_model_to_file() print 'Training over, best model got at vali_cost:'+str(vali_loss_list[0]) exit(0) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses=[] for batch_start in test_batch_start: total_iteration=max(self.test_lengths[batch_start: batch_start + self.batch_size]) #for test, we need the cost among all the iterations in that batch for iteration in range(total_iteration): cost_i, sentEm=test_model(batch_start, iteration) test_losses.append(cost_i) #test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break ''' end_time = time.clock() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
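The objective J assembled above is a noise-contrastive estimation (NCE) loss: the score of the observed word is pushed above log(k * p_n) while the scores of the k sampled noise words are pushed below it. A plain-numpy restatement of the same formula, with the same shapes as s_theta_data, s_theta_noise and the noise probabilities above, is given here for reference only:

import numpy as np

def nce_objective(s_data, p_n_data, s_noise, p_n_noise, k):
    """NCE loss for one batch: s_data, p_n_data have shape (batch, 1);
    s_noise, p_n_noise have shape (batch, k)."""
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    delta_data = s_data - np.log(k * p_n_data)
    delta_noise = s_noise - np.log(k * p_n_noise)
    log_sigm_data = np.log(sigmoid(delta_data))
    log_sigm_noise = np.log(1.0 - sigmoid(delta_noise))
    return -np.mean(log_sigm_data) - np.mean(np.sum(log_sigm_noise, axis=1))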
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
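This constructor only wires up the sigmoid layers, the RBMs and the logistic output; the greedy layer-wise pretraining step usually accompanies it as a separate method. A hedged sketch of that method, assuming each RBM in self.rbm_layers exposes get_cost_updates(lr, persistent, k) as in the reference RBM implementation (adapt if the local RBM class differs):

import theano
import theano.tensor as T

def pretraining_functions(self, train_set_x, batch_size, k=1):
    """Build one CD-k update function per RBM in self.rbm_layers (sketch)."""
    index = T.lscalar('index')
    learning_rate = T.scalar('lr')
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size
    fns = []
    for rbm in self.rbm_layers:
        # assumes the RBM class provides get_cost_updates(lr, persistent, k)
        cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k)
        fns.append(theano.function(
            [index, learning_rate], cost, updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]}))
    return fns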
def evaluate(init_learning_rate=0.1, n_epochs=200, datasets='Trace' ,nkerns=[256, 256], n_train_batch=10, trans='euc', active_func=T.tanh, window_size = 0.2, ada_flag = False, pool_factor = 2, slice_ratio = 1 ): rng = numpy.random.RandomState(23455) #set random seed learning_rate = theano.shared(numpy.asarray(init_learning_rate,dtype=theano.config.floatX)) #used for learning_rate decay train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] ori_len = datasets[3] slice_ratio = datasets[4] valid_num = valid_set_x.shape[0] increase_num = ori_len - int(ori_len * slice_ratio) + 1 #this can be used as the bath size print "increase factor is ", increase_num, ', ori len', ori_len valid_num_batch = valid_num / increase_num test_num = test_set_x.shape[0] test_num_batch = test_num / increase_num length_train = train_set_x.shape[1] #length after slicing. num_of_categories = int(train_set_y.max()) + 1 window_size = int(length_train * window_size) if window_size < 1 else int(window_size) #*******set up the ma and ds********# ma_base,ma_step,ma_num = 5, 6, 0 ds_base,ds_step, ds_num = 2, 1, 4 ds_num_max = length_train / (pool_factor * window_size) ds_num = min(ds_num, ds_num_max) #*******set up the ma and ds********# (ma_train, ma_valid, ma_test , ma_lengths) = batch_movingavrg(train_set_x, valid_set_x, test_set_x, ma_base, ma_step, ma_num) (ds_train, ds_valid, ds_test , ds_lengths) = batch_downsample(train_set_x, valid_set_x, test_set_x, ds_base, ds_step, ds_num) #concatenate directly data_lengths = [length_train] #downsample part: if ds_lengths != []: data_lengths += ds_lengths train_set_x = numpy.concatenate([train_set_x, ds_train], axis = 1) valid_set_x = numpy.concatenate([valid_set_x, ds_valid], axis = 1) test_set_x = numpy.concatenate([test_set_x, ds_test], axis = 1) #moving average part if ma_lengths != []: data_lengths += ma_lengths train_set_x = numpy.concatenate([train_set_x, ma_train], axis = 1) valid_set_x = numpy.concatenate([valid_set_x, ma_valid], axis = 1) test_set_x = numpy.concatenate([test_set_x, ma_test], axis = 1) train_set_x, train_set_y = shared_dataset(train_set_x, train_set_y) valid_set_x = shared_data_x(valid_set_x) test_set_x = shared_data_x(test_set_x) #compute number of minibatches for training, validation and testing n_train_size = train_set_x.get_value(borrow=True).shape[0] n_valid_size = valid_set_x.get_value(borrow=True).shape[0] n_test_size = test_set_x.get_value(borrow=True).shape[0] batch_size = n_train_size / n_train_batch n_train_batches = n_train_size / batch_size data_dim = train_set_x.get_value(borrow=True).shape[1] print 'train size', n_train_size, ',valid size', n_valid_size, ' test size', n_test_size print 'batch size ', batch_size print 'n_train_batches is ', n_train_batches print 'data dim is ', data_dim print '---------------------------' # allocate symbolic variables for the data index = T.lscalar('index') # index to a [mini]batch # start-snippet-1 x = T.matrix('x') y = T.ivector('y') x_vote = T.matrix('xvote') # the data is presented as rasterized images #y_vote = T.ivector('y_vote') # the labels are presented as 1D vector of ###################### # BUILD ACTUAL MODEL # ###################### print 'building the model...' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. 
layer0_input = [] inputs = x.reshape((batch_size, 1, data_dim, 1)) layer0_input_vote = [] inputs_vote = x_vote.reshape((increase_num, 1, data_dim, 1)) ind = 0 for i in xrange(len(data_lengths)): layer0_input.append(inputs[:,:,ind : ind + data_lengths[i],:]) layer0_input_vote.append(inputs_vote[:,:,ind : ind + data_lengths[i],:]) ind += data_lengths[i] layer0 = [] layer0_vote = [] feature_map_size = 0 for i in xrange(len(layer0_input)): pool_size = (data_lengths[i] - window_size + 1) / pool_factor feature_map_size += (data_lengths[i] - window_size + 1) / pool_size layer0.append(ShapeletPoolLayer( numpy.random.RandomState(23455 + i), input=layer0_input[i], image_shape=(batch_size, 1, data_lengths[i], 1), filter_shape=(nkerns[0], 1, window_size, 1), poolsize=(pool_size , 1), trans = trans, active_func=active_func )) layer0_vote.append(ShapeletPoolLayer( numpy.random.RandomState(23455 + i), input=layer0_input_vote[i], image_shape=(increase_num, 1, data_lengths[i], 1), filter_shape=(nkerns[0], 1, window_size, 1), poolsize=(pool_size , 1), W = layer0[i].W, trans = trans, active_func=active_func )) layer1_input = layer0[0].output.flatten(2) layer1_vote_input = layer0_vote[0].output.flatten(2) for i in xrange(1, len(data_lengths)): layer1_input = T.concatenate([layer1_input, layer0[i].output.flatten(2)], axis = 1) layer1_vote_input = T.concatenate([layer1_vote_input, layer0_vote[i].output.flatten(2)], axis = 1) # construct a fully-connected sigmoidal layer layer1 = HiddenLayer( rng, input=layer1_input, n_in=nkerns[0] * feature_map_size, n_out=nkerns[1], activation=active_func, previous_layer = None ) # construct a fully-connected sigmoidal layer for prediction layer1_vote = HiddenLayer( rng, input=layer1_vote_input, n_in=nkerns[0] * feature_map_size, n_out=nkerns[1], activation=active_func, previous_layer = None, W = layer1.W, b = layer1.b ) # classify the values of the fully-connected sigmoidal layer layer2 = LogisticRegression(input=layer1.output, n_in=nkerns[1], n_out= num_of_categories , previous_layer = None) layer2_vote = LogisticRegressionVote(input=layer1_vote.output, n_in=nkerns[1], n_out= num_of_categories , previous_layer = None, W = layer2.W, b = layer2.b) # the cost we minimize during training is the NLL of the model cost = layer2.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer2_vote.prediction(), givens={ x_vote : test_set_x[index * (increase_num) : (index + 1) * (increase_num)] } ) # function for validation set. 
Return the prediction value validate_model = theano.function( [index], layer2_vote.prediction(), givens={ x_vote : valid_set_x[index * (increase_num) : (index + 1) * (increase_num)] } ) # create a list of all model parameters to be fit by gradient descent params = layer2.params + layer1.params for i in xrange(len(layer0_input)): params += layer0[i].params # Adagradient part grads = T.grad(cost, params) import copy G = [] for i in xrange(2 + len(layer0_input)): G.append( theano.shared( numpy.zeros(params[i].shape.eval(), dtype=theano.config.floatX ), borrow=True )) # parameter update methods if ada_flag == True: updates = [ (param_i, param_i - learning_rate * (grad_i / (T.sqrt(G_i) + 1e-5) )) for param_i, grad_i, G_i in zip(params, grads, G) ] else: updates = [ (param_i, param_i - learning_rate * grad_i ) for param_i, grad_i in zip(params, grads) ] update_G = theano.function(inputs=[index], outputs = G, updates=[(G_i, G_i + T.sqr(grad_i) ) for G_i, grad_i in zip(G,grads)], givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] } ) reset_G = theano.function(inputs=[index],outputs = G, updates=[(G_i, grad_i - grad_i) for G_i, grad_i in zip(G,grads)], givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] } ) #Our training function, return value: NLL cost and training error train_model = theano.function( [index], [cost, layer2.errors(y)], updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) decrease_learning_rate = theano.function(inputs=[], outputs = learning_rate, updates={learning_rate: learning_rate * 1e-4}) ############### # TRAIN MODEL # ############### print 'training...' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 best_test_iter = 0 best_test_loss = numpy.inf test_patience = 200 valid_loss = 0. test_loss = 0. start_time = time.clock() epoch = 0 done_looping = False last_train_err = 1 last_avg_err = float('inf') first_layer_prev = 0 num_no_update_epoch = 0 epoch_avg_cost = float('inf') epoch_avg_err = float('inf') while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 epoch_train_err = 0. epoch_cost = 0. if ada_flag: reset_G(0) num_no_update_epoch += 1 if num_no_update_epoch == 500: break for minibatch_index in xrange(n_train_batches): iteration = (epoch - 1) * n_train_batches + minibatch_index if ada_flag: update_G(minibatch_index) [cost_ij,train_err] = train_model(minibatch_index) epoch_train_err = epoch_train_err + train_err epoch_cost = epoch_cost + cost_ij if (iteration + 1) % validation_frequency == 0: # compute zero-one loss on validation set # validation set loss valid_results = [validate_model(i) for i in xrange(valid_num_batch)] valid_losses = [] for i in xrange(valid_num_batch): y_pred = valid_results[i] label = valid_set_y[i * increase_num] unique_value, sub_ind, correspond_ind, count = numpy.unique(y_pred, True, True, True) unique_value = unique_value.tolist() curr_err = 1. 
if label in unique_value: target_ind = unique_value.index(label) count = count.tolist() sorted_count = sorted(count) if count[target_ind] == sorted_count[-1]: if len(sorted_count) > 1 and sorted_count[-1] == sorted_count[-2]: curr_err = 0.5 #tie else: curr_err = 0. valid_losses.append(curr_err) valid_loss = sum(valid_losses) / float(len(valid_losses)) print('...epoch %i, valid err: %.5f |' % (epoch, valid_loss)), # if we got the best validation score until now if valid_loss <= best_validation_loss: num_no_update_epoch = 0 #improve patience if loss improvement is good enough if valid_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iteration * patience_increase) # save best validation score and iteration number best_validation_loss = valid_loss best_iter = iteration # test it on the test set test_results = [test_model(i) for i in xrange(test_num_batch)] test_losses = [] for i in xrange(test_num_batch): y_pred = test_results[i] label = test_set_y[i * increase_num] unique_value, sub_ind, correspond_ind, count = numpy.unique(y_pred, True, True, True) unique_value = unique_value.tolist() curr_err = 1 if label in unique_value: target_ind = unique_value.index(label) count = count.tolist() sorted_count = sorted(count) if count[target_ind] == sorted_count[-1]: if len(sorted_count) > 1 and sorted_count[-1] == sorted_count[-2]: curr_err = 0.5 # tie else: curr_err = 0. test_losses.append(curr_err) test_loss = sum(test_losses) / float(len(test_losses)) print(('test err: %.5f |') % (test_loss)), best_test_loss = test_loss test_patience = 200 #test_patience -= 1 #if test_patience <= 0: # break if patience <= iteration: done_looping = True break epoch_avg_cost = epoch_cost/n_train_batches epoch_avg_err = epoch_train_err/n_train_batches #curr_lr = decrease_learning_rate() last_avg_err = epoch_avg_cost print ('train err %.5f, cost %.4f' %(epoch_avg_err,epoch_avg_cost)) if epoch_avg_cost == 0: break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test error: %f %%' % (best_validation_loss * 100., best_iter + 1, best_test_loss * 100.)) print('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return best_validation_loss
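The validation and test scoring above counts a series as correct when its true label wins the vote across the increase_num sliced predictions, charges an error of 0.5 on a tie for the top vote, and 1.0 otherwise. A compact standalone restatement of that rule:

import numpy as np

def vote_error(y_pred_slices, label):
    """Error of one test series given the predictions of all its slices."""
    values, counts = np.unique(y_pred_slices, return_counts=True)
    values = values.tolist()
    if label not in values:
        return 1.0
    counts = counts.tolist()
    top = max(counts)
    if counts[values.index(label)] != top:
        return 1.0          # true label did not get the most votes
    return 0.5 if counts.count(top) > 1 else 0.0   # 0.5 on a tie, 0.0 on a clear win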
def runDeepLearning(): ### Loading training set and separting it into training set and testing set myDataset = Dataset() preprocess = 0 datasets = myDataset.loadTrain(preprocess) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] dataset_test = myDataset.loadTest(preprocess) test_set_x, test_set_y, test_set_y_array = dataset_test[0] # temporary solution to get the ground truth of sample out to test_set_y_array. # the reason is that after T.cast, test_set_y becomes TensorVariable, which I do not find way to output its # value...anyone can help? ### Model parameters learning_rate = 0.02 n_epochs = 3000 nkerns = [ 30, 40, 40 ] # number of kernal at each layer, current best performance is 50.0% on testing set, kernal number is [30,40,40] batch_size = 500 # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (48, 48) # size of input images nClass = 7 rng = np.random.RandomState(23455) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[0])) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[0]), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 22, 22), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) layer2 = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(nkerns[0], nkerns[1], 9, 9), filter_shape=(nkerns[2], nkerns[1], 2, 2), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=nClass) # the cost we minimize during training is the NLL of the model cost = layer4.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) test_model = theano.function( [index], layer4.errorsLabel(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = np.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set #test_losses = [test_model(i) for i in xrange(n_test_batches)] test_output = [ test_model(i) for i in xrange(n_test_batches) ] test_losses = [item[0] for item in test_output] #test_y_gt = [label[0] for label in item[1] for item in test_output] # test_y_pred = np.array( [label for label in item[1] for item in test_output]) test_y_gt = np.array( [label for label in item[2] for item in test_output]) #test_y_pred = np.array([item[1] for item in test_output] ) ## the predicted_labels for the input ### it seems that the batchsize cannot be change in Theano.function while training model ### #test_label = reduce(lambda x,y: x+y,test_label) #print test_y_pred #print test_y_gt #print test_set_y_array errorNum = np.count_nonzero(test_y_gt - test_y_pred) errorSampleIndex = [ i for i in range(len(test_y_pred)) if test_y_pred[i] != test_set_y_array[i] ] #print errorNum, len(errorSampleIndex) test_score = np.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) print((' on all test sample %f %%') % ((float(errorNum) / float(len(test_y_pred)) * 100.))) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') #TODO: write the code to save the trained model and test the trained model on test data print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) # save the misclassified samples myDataset.plotSample(test_set_x.get_value(), test_set_y, [i for i in range(0, 100)])
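The shape comments in the convolutional layers above follow the usual valid-convolution plus non-overlapping max-pooling arithmetic. A small sketch of that calculation (a standalone helper, not part of the original code), checked against the 48x48 inputs used in this model:

def conv_pool_output_shape(image_size, filter_size, pool_size=2):
    """Valid convolution followed by non-overlapping max pooling,
    mirroring the shape comments in the LeNet-style layers above."""
    conv_size = image_size - filter_size + 1
    return conv_size // pool_size

# For the 48x48 inputs above: 48 -> 22 -> 9 after two 5x5 conv + 2x2 pool layers,
# matching the image_shape arguments of layer1 (22x22) and layer2 (9x9).
s = conv_pool_output_shape(48, 5)   # (48-5+1)//2 = 22
s = conv_pool_output_shape(s, 5)    # (22-5+1)//2 = 9
print(s)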
def classifier(rng, common_input_l, common_input_r, sents_mask_l, sents_mask_r, drop_conv_W_2_pre, conv_b_2_pre, drop_conv_W_2_gate, conv_b_2_gate, drop_conv_W_2, conv_b_2, drop_conv_W_2_context, conv_b_2_context, labels): conv_layer_2_gate_l = Conv_with_Mask_with_Gate( rng, input_tensor3=common_input_l, mask_matrix=sents_mask_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=drop_conv_W_2_pre, b=conv_b_2_pre, W_gate=drop_conv_W_2_gate, b_gate=conv_b_2_gate) conv_layer_2_gate_r = Conv_with_Mask_with_Gate( rng, input_tensor3=common_input_r, mask_matrix=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=drop_conv_W_2_pre, b=conv_b_2_pre, W_gate=drop_conv_W_2_gate, b_gate=conv_b_2_gate) l_input_4_att = conv_layer_2_gate_l.output_tensor3 #conv_layer_2_gate_l.masked_conv_out_sigmoid*conv_layer_2_pre_l.masked_conv_out+(1.0-conv_layer_2_gate_l.masked_conv_out_sigmoid)*common_input_l r_input_4_att = conv_layer_2_gate_r.output_tensor3 #conv_layer_2_gate_r.masked_conv_out_sigmoid*conv_layer_2_pre_r.masked_conv_out+(1.0-conv_layer_2_gate_r.masked_conv_out_sigmoid)*common_input_r conv_layer_2 = Conv_for_Pair( rng, origin_input_tensor3=common_input_l, origin_input_tensor3_r=common_input_r, input_tensor3=l_input_4_att, input_tensor3_r=r_input_4_att, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=drop_conv_W_2, b=conv_b_2, W_context=drop_conv_W_2_context, b_context=conv_b_2_context) attentive_sent_embeddings_l_2 = conv_layer_2.attentive_maxpool_vec_l attentive_sent_embeddings_r_2 = conv_layer_2.attentive_maxpool_vec_r # attentive_sent_sumpool_l_2 = conv_layer_2.attentive_sumpool_vec_l # attentive_sent_sumpool_r_2 = conv_layer_2.attentive_sumpool_vec_r HL_layer_1_input = T.concatenate([ attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2, attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2 ], axis=1) HL_layer_1_input_size = hidden_size[ 1] * 3 #+extra_size#+(maxSentLen*2+10*2)#+hidden_size[1]*3+1 HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.nnet.relu) HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[0], activation=T.nnet.relu) LR_input_size = HL_layer_1_input_size + 2 * hidden_size[0] U_a = create_ensemble_para( rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] LR_input = T.tanh( T.concatenate( [HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1)) layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. return loss, LR_para + HL_layer_1.params + HL_layer_2.params, layer_LR.p_y_given_x, layer_LR.errors( labels)
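The classifier above feeds HL_layer_1 a concatenation of the two attentive sentence embeddings and their elementwise product, giving an input of size hidden_size[1] * 3. A tiny numpy sketch of that feature construction (the helper name is illustrative):

import numpy as np

def pair_features(v_l, v_r):
    """Build the [left, right, left * right] feature vector fed to HL_layer_1 above."""
    return np.concatenate([v_l, v_r, v_l * v_r], axis=-1)

# Two 4-dimensional sentence embeddings produce a 12-dimensional feature vector (3 * 4).
print(pair_features(np.ones(4), np.full(4, 2.0)).shape)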
def test_dA(learning_rate=0.01, training_epochs=15000, dataset='mnist.pkl.gz', batch_size=5, output_folder='dA_plots'): """ This demo is tested on MNIST :type learning_rate: float :param learning_rate: learning rate used for training the DeNosing AutoEncoder :type training_epochs: int :param training_epochs: number of epochs used for training :type dataset: string :param dataset: path to the picked dataset """ ##datasets = load_data(dataset) #from SdA_mapping import load_data_half #datasets = load_data_half(dataset) print 'loading data' datasets, x_mean, y_mean, x_std, y_std = load_vc() train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] print 'loaded data' # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x1 = T.matrix('x1') # the data is presented as rasterized images x2 = T.matrix('x2') # the data is presented as rasterized images cor_reg = T.scalar('cor_reg') if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) #################################### # BUILDING THE MODEL NO CORRUPTION # #################################### rng = numpy.random.RandomState(123) theano_rng = RandomStreams(rng.randint(2 ** 30)) #da = dA_joint( #numpy_rng=rng, #theano_rng=theano_rng, #input1=x1, #input2=x2, #n_visible1=28 * 28/2, #n_visible2=28 * 28/2, #n_hidden=500 #) print 'initialize functions' da = dA_joint( numpy_rng=rng, theano_rng=theano_rng, input1=x1, input2=x2, cor_reg=cor_reg, #n_visible1=28 * 28/2, #n_visible2=28 * 28/2, n_visible1=24, n_visible2=24, n_hidden=50 ) cost, updates = da.get_cost_updates( corruption_level=0.3, learning_rate=learning_rate ) cor_reg_val = numpy.float32(5.0) train_da = theano.function( [index], cost, updates=updates, givens={ x1: train_set_x[index * batch_size: (index + 1) * batch_size], x2: train_set_y[index * batch_size: (index + 1) * batch_size] } ) fprop_x1 = theano.function( [], outputs=da.output1, givens={ x1: test_set_x }, name='fprop_x1' ) fprop_x2 = theano.function( [], outputs=da.output2, givens={ x2: test_set_y }, name='fprop_x2' ) fprop_x1t = theano.function( [], outputs=da.output1, givens={ x1: train_set_x }, name='fprop_x1' ) fprop_x2t = theano.function( [], outputs=da.output2, givens={ x2: train_set_y }, name='fprop_x2' ) rec_x1 = theano.function( [], outputs=da.rec1, givens={ x1: test_set_x }, name='rec_x1' ) rec_x2 = theano.function( [], outputs=da.rec2, givens={ x2: test_set_y }, name='rec_x2' ) fprop_x1_to_x2 = theano.function( [], outputs=da.reg, givens={ x1: test_set_x }, name='fprop_x12x2' ) updates_reg = [ (da.cor_reg, da.cor_reg+theano.shared(numpy.float32(0.1))) ] update_reg = theano.function( [], updates=updates_reg ) print 'initialize functions ended' start_time = time.clock() ############ # TRAINING # ############ print 'training started' X1=test_set_x.eval() X1 *= x_std X1 += x_mean X2=test_set_y.eval() X2 *= y_std X2 += y_mean from dcca_numpy import cor_cost # go through training epochs for epoch in xrange(training_epochs): # go through trainng set c = [] for batch_index in xrange(n_train_batches): c.append(train_da(batch_index)) #cor_reg_val += 1 #da.cor_reg = theano.shared(cor_reg_val) update_reg() X1H=rec_x1() X2H=rec_x2() X1H *= x_std X1H += x_mean X2H *= y_std X2H += y_mean H1=fprop_x1() H2=fprop_x2() print 'Training epoch' print 'Reconstruction ', 
numpy.mean(numpy.mean((X1H-X1)**2,1)),\ numpy.mean(numpy.mean((X2H-X2)**2,1)) if epoch%5 == 2 : # pretrain middle layer print '... pre-training MIDDLE layer' H1t=fprop_x1t() H2t=fprop_x2t() h1 = T.matrix('x') # the data is presented as rasterized images h2 = T.matrix('y') # the labels are presented as 1D vector of from mlp import HiddenLayer numpy_rng = numpy.random.RandomState(89677) log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh) if 1: # for middle layer learning_rate = 0.1 #H1=theano.shared(H1) #H2=theano.shared(H2) # compute the gradients with respect to the model parameters logreg_cost = log_reg.mse(h2) gparams = T.grad(logreg_cost, log_reg.params) # compute list of fine-tuning updates updates = [ (param, param - gparam * learning_rate) for param, gparam in zip(log_reg.params, gparams) ] train_fn_middle = theano.function( inputs=[], outputs=logreg_cost, updates=updates, givens={ h1: theano.shared(H1t), h2: theano.shared(H2t) }, name='train_middle' ) epoch = 0 while epoch < 100: print epoch, train_fn_middle() epoch += 1 ##X2H=fprop_x1_to_x2() X2H=numpy.tanh(H1.dot(log_reg.W.eval())+log_reg.b.eval()) X2H=numpy.tanh(X2H.dot(da.W2_prime.eval())+da.b2_prime.eval()) X2H *= y_std X2H += y_mean print 'Regression ', numpy.mean(numpy.mean((X2H-X2)**2,1)) print 'Correlation ', cor_cost(H1, H2) end_time = time.clock() training_time = (end_time - start_time) print >> sys.stderr, ('The no corruption code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((training_time) / 60.)) image = Image.fromarray( tile_raster_images(X=da.W1.get_value(borrow=True).T, img_shape=(28, 14), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_0.png') from matplotlib import pyplot as pp pp.plot(H1[:10,:2],'b');pp.plot(H2[:10,:2],'r');pp.show() print cor
filter_shape=(128, 256, 3, 3), image_shape=(batch_size, 256, 10, 10), conv_stride=(1, 1)) conv_out4 = MyConvnetLayer(rng, input=conv_out3.output, filter_shape=(128, 128, 3, 3), image_shape=(batch_size, 128, 8, 8), conv_stride=(1, 1)) layer5_input = conv_out4.output.flatten(2) # construct a fully-connected sigmoidal layer full_5 = HiddenLayer(rng, input=layer5_input, n_in=128 * 6 * 6, n_out=256, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer full_5_softmax = LogisticRegression(input=full_5.output, n_in=256, n_out=5) weight_decay = 1e-5 momentum = 0.9 # Cost function for minibatch cost = T.mean(T.nnet.categorical_crossentropy(full_5_softmax.p_y_given_x, y)) # Concatenation of the params params = full_5_softmax.params + full_5.params + conv_out4.params + conv_out3.params + conv_out2.params + conv_out1.params # create theano function to compute filtered images train_model = theano.function( [x, y, lr],
class ExtGrCNNMatchScorer(object): ''' Extended Gated Recursive Convolutional Neural Network for matching task. The last layer of the model includes a linear layer for regression. ''' def __init__(self, config=None, verbose=True): # Construct two GrCNNEncoders for matching two sentences self.encoderL = ExtGrCNNEncoder(config, verbose) self.encoderR = ExtGrCNNEncoder(config, verbose) # Link the parameters of two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Build three kinds of inputs: # 1, inputL, inputR. This pair is used for computing the score after training # 2, inputPL, inputPR. This part is used for training positive pairs # 3, inputNL, inputNR. This part is used for training negative pairs self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Positive self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Linking input-output mapping self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Positive self.hiddenPL = self.encoderL.encode(self.inputPL) self.hiddenPR = self.encoderR.encode(self.inputPR) # Negative self.hiddenNL = self.encoderL.encode(self.inputNL) self.hiddenNR = self.encoderR.encode(self.inputNR) # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1) self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=1) self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer(self.hidden, (2*config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP) self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN) # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hidden.shape) maskP = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenP.shape) maskN = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenN.shape) self.compressed_hidden *= T.cast(mask, floatX) self.compressed_hiddenP *= T.cast(maskP, floatX) self.compressed_hiddenN *= T.cast(maskN, floatX) # Score layers self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.compressed_hiddenP) self.scoreN = self.score_layer.encode(self.compressed_hiddenN) # Accumulate parameters self.params += self.score_layer.params # Build cost function self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Compute the total number of parameters in the model self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build class methods self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) self.compute_cost_and_gradient = 
theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams+[self.cost, self.scoreP, self.scoreN]) self.show_scores = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.scoreP, self.scoreN]) self.show_hiddens = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.hiddenP, self.hiddenN]) self.show_inputs = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR]) if verbose: logger.debug('Architecture of ExtGrCNNMatchScorer built finished, summarized below: ') logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension inside GrCNNMatchScorer pyramid: %d' % config.num_hidden) logger.debug('Hidden dimension MLP: %d' % config.num_mlp) logger.debug('Number of Gating functions: %d' % config.num_gates) logger.debug('There are 2 ExtGrCNNEncoders used in model.') logger.debug('Total number of parameters used in the model: %d' % self.num_params) def update_params(self, grads, learn_rate): ''' @grads: [np.ndarray]. List of numpy.ndarray for updating the model parameters. @learn_rate: scalar. Learning rate. ''' for param, grad in zip(self.params, grads): p = param.get_value(borrow=True) param.set_value(p - learn_rate * grad, borrow=True) def set_params(self, params): ''' @params: [np.ndarray]. List of numpy.ndarray to set the model parameters. ''' for p, param in zip(self.params, params): p.set_value(param, borrow=True) def deepcopy(self, grcnn): ''' @grcnn: GrCNNMatchScorer. Copy the model parameters of another GrCNNMatchScorer and use it. ''' assert len(self.params) == len(grcnn.params) for p, param in zip(self.params, grcnn.params): val = param.get_value() p.set_value(val) @staticmethod def save(fname, model): ''' @fname: String. Filename to store the model. @model: GrCNNMatchScorer. An instance of GrCNNMatchScorer to be saved. ''' with file(fname, 'wb') as fout: cPickle.dump(model, fout) @staticmethod def load(fname): ''' @fname: String. Filename to load the model. ''' with file(fname, 'rb') as fin: model = cPickle.load(fin) return model
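Both ranking scorers in this section train with the same pairwise hinge objective, mean(max(0, 1 - scoreP + scoreN)), and apply plain SGD steps in update_params. A minimal numpy sketch of that loss, assuming the positive and negative scores have already been computed:

import numpy as np

def hinge_ranking_loss(score_pos, score_neg, margin=1.0):
    """Mean pairwise hinge loss max(0, margin - s_pos + s_neg),
    the same form as self.cost in the scorers above."""
    return np.mean(np.maximum(0.0, margin - score_pos + score_neg))

# The first pair is ranked correctly with margin > 1 (loss 0); the second is not (loss 1.6).
print(hinge_ranking_loss(np.array([2.0, 0.3]), np.array([0.5, 0.9])))  # 0.8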
def __init__(self, input_data, n_in, hidden_layer= 100, n_out = 4, weights = None,act_func = T.nnet.sigmoid, filename = None): print "Linear_Regression: From RL_METHODS" """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ self.input = input_data W1 = b1 = W2 = b2 = None print weights if weights != None: try: W1, b1 = weights except Exception as e: W1 = weights[0] #self.linearRegression = LinearRegression(self.sigmoid_layer.output, n_in = hidden_layer, n_out = n_out) numpy_rng = numpy.random.RandomState() self.linearRegression = HiddenLayer(rng=numpy_rng, input= self.input, n_in= n_in, n_out = n_out, W_values = W1, b_values = b1, activation=act_func) ''' self.linearRegression2 = HiddenLayer(rng=numpy_rng, input= self.linearRegression.output, n_in= hidden_layer, n_out = n_out, W_values = W2, b_values = b2, activation=None) ''' self.L1 = abs(self.linearRegression.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = T.mean(self.linearRegression.W ** 2) self.params = self.linearRegression.params self.output = self.linearRegression.output self.cost = self.linearRegression.mse if filename != None: self.load(filename) print "Network Loaded from %s" % (filename)
class ConvolutionalNeuralNetwork(Classifier): def __init__(self, rng, batch_size, nkerns=(20, 50)): self.batch_size = batch_size # 28x28 -> (24x24) // 2 = 12x12 self.layer0 = LeNetConvPoolLayer( rng=rng, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), ) # 12x12 -> (8x8) // 2 = 4x4 self.layer1 = LeNetConvPoolLayer(rng=rng, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5)) # TODO: make this an MLP rather than a hidden layer -> LogReg # self.layer2 = MLP() self.layer2 = HiddenLayer( rng=rng, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh, ) self.layer3 = LogisticRegression( n_in=500, n_out=10, ) def pre_logreg_output(self, x): layer0_input = x.reshape((self.batch_size, 1, 28, 28)) l0_output = self.layer0.output(layer0_input) l1_output = self.layer1.output(l0_output) l2_input = l1_output.flatten(2) l2_output = self.layer2.output(l2_input) return l2_output def negative_log_likelihood(self, x, y): output = self.pre_logreg_output(x) return self.layer3.negative_log_likelihood(output, y) def pred_label(self, x): output = self.pre_logreg_output(x) output = output.flatten(1) return self.layer3.pred_label(output) def errors(self, x, y): output = self.pre_logreg_output(x) return self.layer3.errors(output, y) def train(self, train_x, train_y, test_x, test_y, valid_x, valid_y, alpha=0.13, batch_size=500, l1_reg=0., l2_reg=0.0, n_epochs=1000): x = T.matrix('x') y = T.ivector('y') batch_size = self.batch_size layer0_input = x.reshape((batch_size, 1, 28, 28)) cost = self.negative_log_likelihood(layer0_input, y) params = self.layer0.params + self.layer1.params + self.layer2.params + self.layer3.params grads = T.grad(cost, params) updates = [(param, param - alpha * grad) for param, grad in zip(params, grads)] index = T.lscalar() train_func = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size], }) best_loss = self.run_batches(train_x, train_y, test_x, test_y, valid_x, valid_y, x, y, train_model_func=train_func, batch_size=batch_size, n_epochs=n_epochs) return best_loss
def evaluate_lenet5(train, test, valid, learning_rate=0.1, n_epochs=200, nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :param dataset train: Fuel dataset to use for training. :param dataset test: Fuel dataset to use for testing. :param dataset valid: Fuel dataset to use for validation. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) train_stream = DataStream.default_stream( train, iteration_scheme=SequentialScheme(train.num_examples, batch_size)) valid_stream = DataStream.default_stream( valid, iteration_scheme=SequentialScheme(train.num_examples, batch_size)) test_stream = DataStream.default_stream( test, iteration_scheme=SequentialScheme(train.num_examples, batch_size)) x = T.tensor4('x') yi = T.imatrix('y') y = yi.reshape((yi.shape[0],)) ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=x, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [x, yi], layer3.errors(y) ) validate_model = theano.function( [x, yi], layer3.errors(y) ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [x, yi], cost, updates=updates )
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"): print "mode: ", mode print "data_name: ", data_name print "pooling_mode: ", pooling_mode print "Started!" data_names = data_name.split(":") data_count = len(data_names) print "Train dataset:" for i in xrange(data_count): print "%d: %s" % (i, data_names[i]) print "Test dataset:" test_data_names = test_dataname.split(":") test_data_count = len(test_data_names) for i in xrange(test_data_count): print "%d: %s" % (i, test_data_names[i]) if test_data_count != data_count: raise Exception( "The amount of test and train dataset must be the same.") rng = numpy.random.RandomState(23455) docSentenceCount = T.ivector("docSentenceCount") sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") docLabel = T.ivector('docLabel') hidden_layer_w = None hidden_layer_b = None logistic_layer_w = None logistic_layer_b = None layer0 = list() layer1 = list() layer2 = list() local_params = list() # for list-type data for i in xrange(data_count): layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=50, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=10, \ docLayerNodesSize=[3, 50], pooling_mode=pooling_mode)) layer1.append( HiddenLayer(rng, input=layer0[i].output, n_in=layer0[i].outputDimension, n_out=10, activation=T.tanh, W=hidden_layer_w, b=hidden_layer_b)) # hidden_layer_w = layer1[i].W # hidden_layer_b = layer1[i].b layer2.append( LogisticRegression(input=layer1[i].output, n_in=10, n_out=2, W=logistic_layer_w, b=logistic_layer_b)) logistic_layer_w = layer2[i].W logistic_layer_b = layer2[i].b local_params.append(layer0[i].params + layer1[i].params) share_params = list(layer2[0].params) # construct the parameter array. params = list(layer2[0].params) for i in xrange(data_count): params += layer1[0].params + layer0[i].params # data_name = "car" para_path = "data/" + data_name + "/log_model/" + pooling_mode + ".model" traintext = [ "data/" + data_names[i] + "/train/text" for i in xrange(data_count) ] trainlabel = [ "data/" + data_names[i] + "/train/label" for i in xrange(data_count) ] testtext = [ "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count) ] testlabel = [ "data/" + test_data_names[i] + "/test/label" for i in xrange(data_count) ] # Load the parameters last time, optionally. loadParamsVal(para_path, params) if (mode == "train" or mode == "test"): train_model = list() valid_model = list() print "Loading train data." batchSize = 10 share_learning_rate = 0.01 local_learning_rate = 0.1 n_batches = list() print "Loading test data." 
for i in xrange(data_count): cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i]) docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus( [0, 100000]) docMatrixes = transToTensor(docMatrixes, theano.config.floatX) docSentenceNums = transToTensor(docSentenceNums, numpy.int32) sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32) labels = transToTensor(labels, numpy.int32) index = T.lscalar("index") n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1) print "Dataname: %s" % data_names[i] print "Train set size is ", len(docMatrixes.get_value()) print "Batch size is ", batchSize print "Number of training batches is ", n_batches[i] error = layer2[i].errors(docLabel) cost = layer2[i].negative_log_likelihood(docLabel) share_grads = T.grad(cost, share_params) share_updates = [ (param_i, param_i - share_learning_rate * grad_i) for param_i, grad_i in zip(share_params, share_grads) ] grads = T.grad(cost, local_params[i]) local_updates = [ (param_i, param_i - local_learning_rate * grad_i) for param_i, grad_i in zip(local_params[i], grads) ] updates = share_updates + local_updates print "Compiling train computing graph." if mode == "train": train_model.append( theano.function( [index], [cost, error, layer2[i].y_pred, docLabel], updates=updates, givens={ corpus: docMatrixes, docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1], sentenceWordCount: sentenceWordNums, docLabel: labels[index * batchSize:(index + 1) * batchSize] })) print "Compiled." print "Load test dataname: %s" % test_data_names[i] cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i]) validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus( [0, 1000]) validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX) validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32) validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32) validLabels = transToTensor(validLabels, numpy.int32) print "Validating set size is ", len(validDocMatrixes.get_value()) print "Data loaded." print "Compiling test computing graph." valid_model.append( theano.function( [], [ cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1] ], givens={ corpus: validDocMatrixes, docSentenceCount: validDocSentenceNums, sentenceWordCount: validSentenceWordNums, docLabel: validLabels })) print "Compiled." costNum, errorNum, pred_label, real_label, pred_prob = valid_model[ i]() print "Valid current model :", data_names[i] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) if 1 in threshold: index_of_one = list(threshold).index(1) print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "threshold: ", threshold[index_of_one] if mode == "test": return print "Start to train." 
epoch = 0 n_epochs = 10 ite = 0 # ####Validate the model#### # for dataset_index in xrange(data_count): # costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]() # print "Valid current model :", data_names[dataset_index] # print "Cost: ", costNum # print "Error: ", errorNum # # fpr, tpr, _ = roc_curve(real_label, pred_prob) # roc_auc = auc(fpr, tpr) # print "data_name: ", data_name # print "ROC: ", roc_auc # fpr, tpr, threshold = roc_curve(real_label, pred_label) # index_of_one = list(threshold).index(1) # print "TPR: ", tpr[index_of_one] # print "FPR: ", fpr[index_of_one] # print "threshold: ", threshold[index_of_one] while (epoch < n_epochs): epoch = epoch + 1 ####################### for i in range(max(n_batches)): for dataset_index in xrange(data_count): if i >= n_batches[dataset_index]: continue # for list-type data print "dataset_index: %d, i: %d" % (dataset_index, i) costNum, errorNum, pred_label, real_label = train_model[ dataset_index](i) ite = ite + 1 # for padding data if (ite % 10 == 0): print print "Dataset name: ", data_names[dataset_index] print "@iter: ", ite print "Cost: ", costNum print "Error: ", errorNum # Validate the model for dataset_index in xrange(data_count): costNum, errorNum, pred_label, real_label, pred_prob = valid_model[ dataset_index]() print "Valid current model :", data_names[dataset_index] print "Cost: ", costNum print "Error: ", errorNum fpr, tpr, _ = roc_curve(real_label, pred_prob) roc_auc = auc(fpr, tpr) print "data_name: ", data_name print "ROC: ", roc_auc fpr, tpr, threshold = roc_curve(real_label, pred_label) index_of_one = list(threshold).index(1) print "TPR: ", tpr[index_of_one] print "FPR: ", fpr[index_of_one] print "threshold: ", threshold[index_of_one] # Save model print "Saving parameters." saveParamsVal(para_path, params) print "Saved."
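The validation passes above report ROC AUC from the predicted probability of the positive class via sklearn's roc_curve and auc, and then look up the TPR/FPR at threshold 1 for the hard predictions. A self-contained sketch of the AUC part with illustrative arrays:

import numpy as np
from sklearn.metrics import roc_curve, auc

real_label = np.array([0, 1, 1, 0, 1])            # ground-truth document labels
pred_prob = np.array([0.2, 0.9, 0.6, 0.4, 0.7])   # P(y=1) per document, as taken from p_y_given_x

fpr, tpr, _ = roc_curve(real_label, pred_prob)
print("ROC AUC:", auc(fpr, tpr))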
def evaluate_lenet5(learning_rate=0.01, n_epochs=10000, dataset='cifar-10-batches-py', nkerns=[32, 64, 128], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # Example of how to reshape and display input # a=train_set_x[0].reshape((3,1024,1)).eval() # make_filter_fig(fname='results/input.png', # filters=a, # combine_chans=True) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (32, 32) # this is the size of MNIST images nChannels = 3 # the number of channels print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer reshaped_input = x.reshape((batch_size, 3, 32, 32)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (32-5+1+4,32-5+1+4)=(32,32) # maxpooling reduces this further to (32/2,32/2) = (16,16) # 4D output tensor is thus of shape (batch_size,nkerns[0],16,16) conv0 = LeNetConvPoolLayer( rng, input=reshaped_input, image_shape=(batch_size, 3, 32, 32), filter_shape=(nkerns[0], 3, 5, 5), filter_pad=2, poolsize=(2, 2)) # conv0_vis = HiddenLayer(rng, input=conv0.output.flatten(2), # n_in=nkerns[0] * 16 * 16, # n_out=3 * 32 * 32, activation=T.tanh) # print conv0_vis.W.eval().shape # (8192, 3072) # Construct the second convolutional pooling layer # filtering reduces the image size to (16-5+1+2,16-5+1+2)=(14,14) # maxpooling reduces this further to (14/2,14/2) = (7,7) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],7,7) conv1 = LeNetConvPoolLayer( rng, input=conv0.output, image_shape=(batch_size, nkerns[0], 16, 16), filter_shape=(nkerns[1], nkerns[0], 5, 5), filter_pad=1, poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (batch_size,128*4*4) = (batch_size,2048) hidden_input = conv1.output.flatten(2) # construct a fully-connected sigmoidal layer hidden = HiddenLayer(rng, input=hidden_input, n_in=nkerns[1] * 7 * 7, n_out=1024, activation=T.tanh) hidden_vis = HiddenLayer(rng, input=hidden.output, n_in=1024, n_out=3072, activation=T.nnet.sigmoid) # classify the values of the fully-connected sigmoidal layer softmax = LogisticRegression(input=hidden.output, n_in=1024, n_out=10) softmax_vis = HiddenLayer(rng, input=softmax.p_y_given_x, n_in=10, n_out=3072, activation=T.nnet.sigmoid) # the cost we minimize during training is the NLL of the model cost = softmax.negative_log_likelihood(y) hidden_vis_cost = hidden_vis.reconstruction_cost(x) softmax_vis_cost = softmax_vis.reconstruction_cost(x) # create a function to compute the mistakes that are made by the model test_model = theano.function([index], softmax.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function([index], softmax.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = softmax.params + hidden.params + conv1.params + conv0.params hidden_vis_params = hidden_vis.params softmax_vis_params = softmax_vis.params # create a list of gradients for all model parameters grads = T.grad(cost, params) hidden_vis_grads = T.grad(hidden_vis_cost, hidden_vis_params) softmax_vis_grads = T.grad(softmax_vis_cost, softmax_vis_params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) for param_i, grad_i in zip(hidden_vis_params, hidden_vis_grads): updates.append((param_i, param_i - learning_rate * grad_i)) for param_i, grad_i in zip(softmax_vis_params, softmax_vis_grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size]}) print '... training' patience = 1000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False costs = [] valid = [] while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index cost_ij = train_model(minibatch_index) costs.append(cost_ij) if iter % 100 == 0: print('Step %d Cost %f' % (iter, cost_ij)) make_filter_fig(fname='results/hidden.png', filters=hidden_vis.W.T.eval().reshape((3,1024,1024)), filter_start=0, num_filters=16*16, combine_chans=True) make_filter_fig(fname='results/softmax.png', filters=softmax_vis.W.T.eval().reshape((3,1024,10)), filter_start=0, num_filters=10, combine_chans=True) # rs = conv0_vis.W.reshape((3, nkerns[0] * 16 * 16, 32*32)) # (3,8192,1024) # rs2 = rs.dimshuffle(0,2,1) # make_filter_fig(fname='results/conv0.png', # filters=rs2.eval(), # filter_start=0, # num_filters=16*16, # combine_chans=True) # rs = conv0_vis.W.T # (3072,8192) # rs2 = rs.reshape((3, 1024, 8192)) # make_filter_fig(fname='results/conv0-alt.png', # filters=rs2.eval(), # filter_start=0, # num_filters=16*16, # combine_chans=True) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) valid.append(this_validation_loss * 100.) print('epoch %i, minibatch %i/%i, validation error %.2f%%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter best_params = params # test it on the test set test_losses = [test_model(i) for i in xrange(n_test_batches)] test_score = numpy.mean(test_losses) print(('New Best! epoch %i, minibatch %i/%i, test error of best ' 'model %.2f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return best_params
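The LeNet training loops in this section share the same patience-based early-stopping schedule: validate every validation_frequency iterations, extend patience only when the improvement beats improvement_threshold, and stop once patience is exhausted. A compact, framework-free sketch of that schedule, where validate is an assumed callable returning the current validation error:

def train_with_patience(n_iterations, validation_frequency, validate,
                        patience=1000, patience_increase=2,
                        improvement_threshold=0.995):
    """Minimal sketch of the patience-based early stopping used above."""
    best_loss, best_iter = float('inf'), 0
    for iteration in range(n_iterations):
        if (iteration + 1) % validation_frequency == 0:
            loss = validate(iteration)
            if loss < best_loss:
                # extend patience only on a sufficiently large relative improvement
                if loss < best_loss * improvement_threshold:
                    patience = max(patience, iteration * patience_increase)
                best_loss, best_iter = loss, iteration
        if patience <= iteration:
            break  # ran out of patience: stop early
    return best_loss, best_iter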
def extract_256array(img_dir): #print "Loading params..." loaded_params=load_params('params-momentum_weightdecay-NEW-BIg4class-01.pkl') num_images=len(glob.glob(img_dir + '*.jpg')) if batch_size==0: sys.exit() ## MODEL CNN # Declare the symbolic variable for the input image x = T.tensor4('x') # the data is presented as rasterized images # Symbolic variables for the parameters loaded from file W1 = theano.shared(loaded_params[0][0].get_value());B1 = theano.shared(loaded_params[0][1].get_value()) W2=theano.shared(loaded_params[1][0].get_value());B2 = theano.shared(loaded_params[1][1].get_value()) W3=theano.shared(loaded_params[2][0].get_value());B3 = theano.shared(loaded_params[2][1].get_value()) W4=theano.shared(loaded_params[3][0].get_value());B4 = theano.shared(loaded_params[3][1].get_value()) W5=theano.shared(loaded_params[4][0].get_value());B5 = theano.shared(loaded_params[4][1].get_value()) #batch_size=1 #print "Building model..." layer0_input = x.reshape((batch_size, 3, 61, 61)) # build symbolic expression that computes the convolution of input with filters in w conv_out1=MyConvnetLayer(W1,B1,input=layer0_input,filter_shape=(64, 3, 5, 5),image_shape=(batch_size, 3, 61, 61),conv_stride=(2,2),pool_stride=(2,2),poolsize=(3,3)) # conv_out2=MyConvnetLayer(W2,B2,input=conv_out1.output,filter_shape=(256, 64, 5, 5),image_shape=(batch_size, 64, 14, 14),conv_stride=(1,1)) conv_out3=MyConvnetLayer(W3,B3,input=conv_out2.output,filter_shape=(128, 256, 3, 3),image_shape=(batch_size, 256, 10, 10),conv_stride=(1,1)) conv_out4=MyConvnetLayer(W4,B4,input=conv_out3.output,filter_shape=(128, 128, 3, 3),image_shape=(batch_size, 128, 8, 8),conv_stride=(1,1)) layer5_input = conv_out4.output.flatten(2) # construct a fully-connected sigmoidal layer full_5 = HiddenLayer( W5,B5, input=layer5_input, n_in=128 * 6 * 6, n_out=256, activation=T.tanh ) # create theano function to compute filtered images f_layer5 = theano.function([x], full_5.output, allow_input_downcast=True,on_unused_input='ignore' ) ## END MODEL CNN if num_images<batch_size: batchsize=num_images else: batchsize=batch_size num_batch=int(math.ceil(num_images/float(batchsize))) features=np.zeros((num_images,256),theano.config.floatX,'C') for j in range(1,num_batch+1): images=np.zeros((batch_size,3,61,61),theano.config.floatX,'C') i=0 num_img_batch=min(batchsize,num_images-batchsize*(j-1)) for i_img in range(1,num_img_batch+1): img_name=img_dir+str(i_img+batchsize*(j-1))+'.jpg' img = Image.open(img_name) img_res=img.resize((61,61), PIL.Image.ANTIALIAS) # dimensions are (height, width, channel) img_res=sub_mean(img_res) img_res = numpy.asarray(img_res, dtype=theano.config.floatX) / 256. # put image in 4D tensor of shape (1, 3, height, width) img_ = img_res.transpose(2, 0, 1).reshape(1, 3, 61, 61) images[i,:,:,:]=img_ i=i+1 feature_256=f_layer5(images) range_feat=range(batchsize*(j-1),batchsize*(j-1)+ num_img_batch) for k in range(0,num_img_batch): features[range_feat[k]]=feature_256[k] #print feature_256 return features
ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1]) # [?, 14, 14, 32] # conv and pool layer1 layer1_conv = ConvLayer(layer0_pool.output, filter_shape=[5, 5, 32, 64], strides=[1, 1, 1, 1], activation=tf.nn.relu, padding="SAME") # [?, 14, 14, 64] layer1_pool = MaxPoolLayer(layer1_conv.output, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1]) # [?, 7, 7, 64] # flatten layer layer2_flatten = FlattenLayer(layer1_pool.output, shape=[-1, 7 * 7 * 64]) # fully-connected layer layer3_fullyconn = HiddenLayer(layer2_flatten.output, n_in=7 * 7 * 64, n_out=256, activation=tf.nn.relu) # dropout layer layer3_dropout = DropoutLayer(layer3_fullyconn.output, keep_prob=0.5) # the output layer layer4_output = LogisticRegression(layer3_dropout.output, n_in=256, n_out=10) # params for training params = layer0_conv.params + layer1_conv.params + layer3_fullyconn.params + layer4_output.params # train dicts for dropout train_dicts = layer3_dropout.train_dicts # prediction dicts for dropout pred_dicts = layer3_dropout.pred_dicts
def sgd_optimize(learning_rate=0.1, n_epochs=200, batch_size=500, nkerns=[20, 50]): # Load input train, valid, test = util.load() print "loading 0 - ", train[0].shape[0], " train inputs in gpu memory" train_x, train_y = util.create_theano_shared(train) print "loading 0 - ", valid[0].shape[0], " validation inputs in gpu memory" valid_x, valid_y = util.create_theano_shared(valid) print "loading 0 - ", test[0].shape[0], " test inputs in gpu memory" test_x, test_y = util.create_theano_shared(test) # Define symbolic input matrices print "Building Model..." index = T.iscalar() x = T.matrix("x") y = T.ivector("y") random_generator = numpy.random.RandomState(1) # Create Layer0 of Lenet Model layer0_input = x.reshape( (batch_size, 1, 28, 28) ) filter_shape0 = (nkerns[0], 1, 5, 5) image_shape0 = (batch_size, 1, 28, 28) layer0 = LeNetConvPoolLayer(layer0_input, filter_shape0, image_shape0, random_generator) # Create Layer1 of Lenet model filter_shape1 = (nkerns[1], nkerns[0], 5, 5) image_shape1 = (batch_size, nkerns[0], 12, 12) layer1 = LeNetConvPoolLayer(layer0.output, filter_shape1, image_shape1, random_generator) # Create Layer2 which is a simple MLP hidden layer layer2_input = layer1.output.flatten(2) layer2 = HiddenLayer(layer2_input, nkerns[1] * 4 * 4, 500, random_generator) # Finally, Layer3 is LogisticRegression layer layer3 = LogisticRegression(layer2.output, 500, 10) # Define error error = layer3.error(y) # Create cost function cost = layer3.negative_log_likelihood(y) # Gradient and update functions params = layer3.params + layer2.params + layer1.params + layer0.params grads = T.grad(cost, wrt=params) updates = list() for i in range(len(params)): updates.append( (params[i], params[i] - learning_rate * grads[i]) ) # Train model train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens = { x: train_x[index*batch_size : (index+1)*batch_size], y: train_y[index*batch_size : (index+1)*batch_size] }) # Valid model valid_model = theano.function( inputs=[index], outputs=error, givens = { x: valid_x[index*batch_size : (index+1)*batch_size], y: valid_y[index*batch_size : (index+1)*batch_size] }) # Test Model test_model = theano.function( inputs=[index], outputs=error, givens={ x: test_x[index*batch_size : (index+1)*batch_size], y: test_y[index*batch_size : (index+1)*batch_size] }) # Create number of minibatches n_train_batches = train[0].shape[0] / batch_size n_valid_batches = valid[0].shape[0] / batch_size n_test_batches = test[0].shape[0] / batch_size # Finally, main loop for training util.train_test_model(n_epochs, train_model, valid_model, test_model, n_train_batches, n_valid_batches, n_test_batches)
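The updates lists built throughout this section all express the same vanilla SGD rule, param := param - learning_rate * grad, as symbolic (param, new_value) pairs. A framework-agnostic numpy sketch of one such step (names are illustrative):

import numpy as np

def sgd_step(params, grads, learning_rate=0.1):
    """Apply one vanilla SGD step in place; params and grads are parallel lists of arrays."""
    for p, g in zip(params, grads):
        p -= learning_rate * g

w, b = np.ones(3), np.zeros(1)
sgd_step([w, b], [np.full(3, 0.5), np.array([1.0])], learning_rate=0.1)
print(w, b)  # [0.95 0.95 0.95] [-0.1]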
def __init__(self, D, M, Q, Domain_number, D_Y, M_Y): self.Xlabel = T.matrix('Xlabel') self.X = T.matrix('X') self.Y = T.matrix('Y') N = self.X.shape[0] self.Weight = T.matrix('Weight') ker = kernel(Q) mmd = MMD(M, Domain_number) mu_value = np.random.randn(M, D) Sigma_b_value = np.zeros((M, M)) + np.log(0.01) Z_value = np.random.randn(M, Q) ls_value = np.zeros(Domain_number) + np.log(0.1) self.mu = theano.shared(value=mu_value, name='mu', borrow=True) self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True) self.Z = theano.shared(value=Z_value, name='Z', borrow=True) self.ls = theano.shared(value=ls_value, name='ls', borrow=True) self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=20, activation=T.nnet.relu, number='_x') self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=20, n_out=Q, activation=T.nnet.relu, number='_m') self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=20, n_out=Q, activation=T.nnet.relu, number='_S') ################################################################################# ### Model computation (X side) m = self.hiddenLayer_m.output S_0 = self.hiddenLayer_S.output S_1 = T.exp(S_0) S = T.sqrt(S_1) from theano.tensor.shared_randomstreams import RandomStreams srng = RandomStreams(seed=234) eps_NQ = srng.normal((N, Q)) eps_M = srng.normal((M, D)) # the mean and the variance need different random draws, so they are named separately beta = T.exp(self.ls) # u is not diagonal, so build a triangular matrix (e.g. via a Cholesky-style decomposition) Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b)))) # scale transformation mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma Xtilda = m + S * eps_NQ self.U = mu_scaled + Sigma_scaled.dot(eps_M) Kmm = ker.RBF(self.Z) Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight) KmmInv = sT.matrix_inverse(Kmm) Kmn = ker.RBF(self.Z, Xtilda) Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight) Knn = ker.RBF(Xtilda) Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight) Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn)) Kinterval = T.dot(KmmInv, Kmn) mean_U = T.dot(Kinterval.T, self.U) betaI = T.diag(T.dot(self.Xlabel, beta)) Covariance = betaI ############################################################################################## ### Computation for the Y side ker_Y = kernel(Q, number='_Y') muY_value = np.random.randn(M_Y, D_Y) SigmaY_b_value = np.zeros((M_Y, M_Y)) + np.log(0.01) ZY_value = np.random.randn(M_Y, Q) lsY_value = np.zeros(1) + np.log(0.1) self.muY = theano.shared(value=muY_value, name='muY', borrow=True) self.SigmaY_b = theano.shared(value=SigmaY_b_value, name='SigmaY_b', borrow=True) self.ZY = theano.shared(value=ZY_value, name='ZY', borrow=True) self.lsY = theano.shared(value=lsY_value, name='lsY', borrow=True) epsY_NQ = srng.normal((N, Q)) epsY_M = srng.normal((M_Y, D_Y)) betaY0 = T.exp(self.lsY) betaY = T.tile(betaY0, N) # u is not diagonal, so build a triangular matrix (e.g. via a Cholesky-style decomposition) SigmaY = T.tril(self.SigmaY_b - T.diag(T.diag(self.SigmaY_b)) + T.diag(T.exp(T.diag(self.SigmaY_b)))) # scale transformation muY_scaled, SigmaY_scaled = ker_Y.sf2**0.5 * self.muY, ker_Y.sf2**0.5 * SigmaY XtildaY = m + S * epsY_NQ self.UY = muY_scaled + SigmaY_scaled.dot(epsY_M) KmmY = ker_Y.RBF(self.ZY) KmmInvY = sT.matrix_inverse(KmmY) KmnY = ker_Y.RBF(self.ZY, XtildaY) KnnY = ker_Y.RBF(XtildaY) KtildaY = KnnY - T.dot(KmnY.T, T.dot(KmmInvY, KmnY)) KintervalY = T.dot(KmmInvY, KmnY) mean_UY = T.dot(KintervalY.T, self.UY) betaIY = T.diag(betaY) CovarianceY = betaIY ############################################################################################## ### Store the parameters 
self.params = [] self.params_X = [self.mu, self.Sigma_b, self.Z, self.ls] self.params_Y = [self.muY, self.SigmaY_b, self.ZY, self.lsY] self.loc_params = [] self.loc_params.extend(self.hiddenLayer_x.params) self.loc_params.extend(self.hiddenLayer_m.params) self.loc_params.extend(self.hiddenLayer_S.params) self.local_params = {} for i in self.loc_params: self.local_params[str(i)] = i self.params_X.extend(ker.params) self.params_X.extend(mmd.params) self.params_Y.extend(ker_Y.params) self.global_params_X = {} for i in self.params_X: self.global_params_X[str(i)] = i self.global_params_Y = {} for i in self.params_Y: self.global_params_Y[str(i)] = i self.params.extend(self.params_X) self.params.extend(self.params_Y) self.params.extend(self.loc_params) self.wrt = {} for i in self.params: self.wrt[str(i)] = i ############################################################################################### ### Final likelihood self.LL = (self.log_mvn(self.X, mean_U, Covariance) - 0.5 * T.sum(T.dot(betaI, Ktilda))) self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv) self.LLY = (self.log_mvn(self.Y, mean_UY, CovarianceY) - 0.5 * T.sum(T.dot(betaIY, KtildaY))) self.KL_UY = -self.KLD_U(muY_scaled, SigmaY_scaled, KmmY, KmmInvY) self.KL_X = -self.KLD_X(m, S)
def __init__(self, rng, input, n_in, n_hiddens, n_out, dropout_rates, activation=None, n_slack=0): """Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hidden: list of int :param n_hidden: number of hidden units """ self.params = [] self.W = [] self.b = [] self.W_actual = [] self.b_actual = [] # keep track of model input self.input = input # Multiple hidden layers print >> sys.stderr, dropout_rates last_layer_out = self.input last_layer_dropout = _dropout_from_layer(rng, self.input, p=dropout_rates[0]) last_layer_size = n_in slacks = numpy.append( numpy.asarray([n_slack], dtype='int32'), numpy.zeros((len(n_hiddens) - 1, ), dtype='int32')) for i in range(0, len(n_hiddens)): # dropped-out path: for training dropoutLayer = DropoutHiddenLayer(rng=rng, input=last_layer_dropout, activation=activation, n_in=last_layer_size, n_out=n_hiddens[i], dropout_rate=dropout_rates[i + 1], n_slack=slacks[i]) last_layer_dropout = dropoutLayer.output self.params += dropoutLayer.params self.W += [dropoutLayer.W] self.b += [dropoutLayer.b] # original (untouched) path: for testing hiddenLayer = HiddenLayer(rng=rng, input=last_layer_out, activation=activation, n_in=last_layer_size, n_out=n_hiddens[i], W=dropoutLayer.W * (1. - dropout_rates[i]), b=dropoutLayer.b, n_slack=slacks[i]) last_layer_out = hiddenLayer.output last_layer_size = n_hiddens[i] self.W_actual += [hiddenLayer.W] self.b_actual += [hiddenLayer.b] # The logistic regression layer gets as input the hidden units # of the hidden layer # Dropped-out path: for training self.dropoutLogRegressionLayer = LogisticRegression( rng=rng, input=last_layer_dropout, n_in=(n_hiddens[-1] if len(n_hiddens) > 0 else n_in), n_out=n_out) self.params += self.dropoutLogRegressionLayer.params # original (untouched) path: for testing self.logRegressionLayer = LogisticRegression( rng=rng, input=last_layer_out, n_in=(n_hiddens[-1] if len(n_hiddens) > 0 else n_in), n_out=n_out, W=self.dropoutLogRegressionLayer.W * (1. - dropout_rates[-1]), b=self.dropoutLogRegressionLayer.b) # prediction of the MLP is given by the prediction of the output of the # model, computed in the logistic regression layer self.dropout_errors = self.dropoutLogRegressionLayer.errors self.dropout_negative_log_likelihood = self.dropoutLogRegressionLayer.negative_log_likelihood self.y_pred = self.logRegressionLayer.y_pred self.errors = self.logRegressionLayer.errors self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
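The MLP above keeps two parallel paths: a dropout path used for training and an untouched path whose weights are the dropout layer's weights rescaled by (1 - dropout_rate) for testing. A short numpy sketch of that convention, assuming a drop probability p (the helper names are illustrative):

import numpy as np

rng = np.random.RandomState(0)

def dropout_train(h, p, rng):
    """Training path: zero each unit with probability p, as in DropoutHiddenLayer."""
    mask = rng.binomial(n=1, p=1.0 - p, size=h.shape)
    return h * mask

def dropout_test_weights(W, p):
    """Test path: rescale the trained weights by (1 - p), matching
    W=dropoutLayer.W * (1. - dropout_rates[i]) in the class above."""
    return W * (1.0 - p)

h = np.ones((2, 3))
print(dropout_train(h, 0.5, rng))           # some activations zeroed at random
print(dropout_test_weights(rng.randn(4, 3), 0.5).shape)  # weights halved, shape unchanged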
def __init__(self, D, M, Q, Domain_number, Hiddenlayerdim1, Hiddenlayerdim2): self.Xlabel = T.matrix('Xlabel') self.X = T.matrix('X') N = self.X.shape[0] self.Weight = T.matrix('Weight') ker = kernel(Q) #mmd=MMD(M,Domain_number) mu_value = np.random.randn(M, D) * 1e-2 Sigma_b_value = np.zeros((M, M)) # + np.log(0.01) Z_value = np.random.randn(M, Q) ls_value = np.zeros(Domain_number) + np.log(0.1) self.mu = theano.shared(value=mu_value, name='mu', borrow=True) self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b', borrow=True) self.Z = theano.shared(value=Z_value, name='Z', borrow=True) self.ls = theano.shared(value=ls_value, name='ls', borrow=True) self.params = [self.mu, self.Sigma_b, self.Z, self.ls] self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D, n_out=Hiddenlayerdim1, activation=T.nnet.relu, number='_x') #self.hiddenLayer_hidden = HiddenLayer(rng=rng,input=self.hiddenLayer_x.output,n_in=Hiddenlayerdim1,n_out=Hiddenlayerdim2,activation=T.nnet.relu,number='_h') self.hiddenLayer_m = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=Hiddenlayerdim1, n_out=Q, activation=T.nnet.relu, number='_m') self.hiddenLayer_S = HiddenLayer(rng=rng, input=self.hiddenLayer_x.output, n_in=Hiddenlayerdim1, n_out=Q, activation=T.nnet.relu, number='_S') self.loc_params = [] self.loc_params.extend(self.hiddenLayer_x.params) #self.loc_params.extend(self.hiddenLayer_hidden.params) self.loc_params.extend(self.hiddenLayer_m.params) self.loc_params.extend(self.hiddenLayer_S.params) self.local_params = {} for i in self.loc_params: self.local_params[str(i)] = i self.params.extend(ker.params) #self.params.extend(mmd.params) self.hyp_params = {} for i in [self.mu, self.Sigma_b, self.ls]: self.hyp_params[str(i)] = i self.Z_params = {} for i in [self.Z]: self.Z_params[str(i)] = i self.global_params = {} for i in self.params: self.global_params[str(i)] = i self.params.extend(self.hiddenLayer_x.params) #self.params.extend(self.hiddenLayer_hidden.params) self.params.extend(self.hiddenLayer_m.params) self.params.extend(self.hiddenLayer_S.params) self.wrt = {} for i in self.params: self.wrt[str(i)] = i m = self.hiddenLayer_m.output S_0 = self.hiddenLayer_S.output S_1 = T.exp(S_0) S = T.sqrt(S_1) from theano.tensor.shared_randomstreams import RandomStreams srng = RandomStreams(seed=234) eps_NQ = srng.normal((N, Q)) eps_M = srng.normal((M, D)) # the mean and the variance need different random draws, so they are named separately eps_ND = srng.normal((N, D)) beta = T.exp(self.ls) # U's covariance is not diagonal, so build a triangular (Cholesky-style) factor Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) + T.diag(T.exp(T.diag(self.Sigma_b)))) # scale transformation mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma Xtilda = m + S * eps_NQ self.U = mu_scaled + Sigma_scaled.dot(eps_M) Kmm = ker.RBF(self.Z) #Kmm=mmd.MMD_kenel_Xonly(mmd.Zlabel_T,Kmm,self.Weight) KmmInv = sT.matrix_inverse(Kmm) Kmn = ker.RBF(self.Z, Xtilda) #Kmn=mmd.MMD_kenel_ZX(self.Xlabel,Kmn,self.Weight) Knn = ker.RBF(Xtilda) #Knn=mmd.MMD_kenel_Xonly(self.Xlabel,Knn,self.Weight) Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn)) F = T.dot(Kmn.T, T.dot(KmmInv, self.U)) + T.dot( T.maximum(Ktilda, 1e-16)**0.5, eps_ND) #Kinterval=T.dot(KmmInv,Kmn) mean_U = F #T.dot(Kinterval.T,self.U) betaI = T.diag(T.dot(self.Xlabel, beta)) Covariance = betaI self.LL = self.log_mvn(self.X, mean_U, Covariance) # - 0.5*T.sum(T.dot(betaI,Ktilda))) self.KL_X = -self.KLD_X(m, S) self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
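# --- Hedged NumPy sketch (illustrative only) of the triangular parameterisation used for Sigma above: the strictly
# lower triangle of the unconstrained matrix Sigma_b is kept as-is and its diagonal is exponentiated, yielding a
# Cholesky-style factor with a positive diagonal. All names below are placeholders, not part of the original code.
import numpy as np

def tril_with_exp_diag(Sigma_b):
    L = np.tril(Sigma_b, k=-1)                    # strictly lower triangle
    return L + np.diag(np.exp(np.diag(Sigma_b)))  # positive diagonal via exp

M = 4
Sigma_b = np.random.randn(M, M) * 0.1
L = tril_with_exp_diag(Sigma_b)
cov = L.dot(L.T)  # L is a valid covariance factor, as used when drawing U = mu_scaled + Sigma_scaled.dot(eps)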
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): rng = numpy.random.RandomState(23455) datasets = load_data(dataset) # load the data train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # define a few variables: index is the minibatch index, x is the input training data, y holds the corresponding labels index = T.lscalar() x = T.matrix('x') y = T.ivector('y') ############ # BUILD THE MODEL # ############ print '... building the model' # each loaded batch has shape (batch_size, 28 * 28), but LeNetConvPoolLayer expects a 4D input, so reshape it layer0_input = x.reshape((batch_size, 1, 28, 28)) # layer0 is the first LeNetConvPoolLayer # a single (28, 28) input image becomes (28-5+1 , 28-5+1) = (24, 24) after conv, # and (24/2, 24/2) = (12, 12) after maxpooling # each batch holds batch_size images and this layer has nkerns[0] kernels, # so layer0's output is (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # layer1 is the second LeNetConvPoolLayer # its input is layer0's output; each (12, 12) feature map becomes (12-5+1, 12-5+1) = (8, 8) after conv, # and (8/2, 8/2) = (4, 4) after maxpooling # each batch holds batch_size feature maps and this layer has nkerns[1] kernels # , so layer1's output is (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the two LeNetConvPoolLayers (layer0 and layer1) are followed by layer2, a fully-connected layer equivalent to the hidden layer of an MLP, # so it can be built with the HiddenLayer class defined for the MLP; layer2 expects a 2D input of shape (batch_size, num_pixels), # so the feature maps produced by the different kernels for the same image must be merged into a single vector, # i.e. layer1's output (batch_size, nkerns[1], 4, 4) is flattened to (batch_size, nkerns[1]*4*4) = (500, 800) and used as layer2's input. # (500, 800) means 500 samples, one per row; layer2's output has shape (batch_size, n_out) = (500, 500) layer2_input = layer1.output.flatten(2) layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh ) # the final layer, layer3, is the classifier, built with the LogisticRegression class from the logistic regression code; # its input is layer2's output (500, 500) and its output has shape (batch_size, n_out) = (500, 10) layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost is the negative log-likelihood (NLL) cost = layer3.negative_log_likelihood(y) # test_model computes the test error: x and y are bound to the slice given by index and layer3 is evaluated, # which in turn pulls in layer2, layer1 and layer0, so test_model is effectively the whole CNN; # its input is the index (binding x and y) and its output is layer3.errors(y), i.e. the error. test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # train_model below uses SGD, so the gradients must be computed and the parameters updated params = layer3.params + layer2.params + layer1.params + layer0.params # parameter list grads = T.grad(cost, params) # gradient w.r.t. each parameter # writing one update rule per parameter would be tedious, # so a for..in comprehension generates the pairs (param_i, param_i - learning_rate * grad_i) automatically updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] # train_model follows the same pattern as test_model, but unlike test_model and validate_model it also applies the updates rules train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size:
(index + 1) * batch_size] } ) ############ # TRAIN THE MODEL # ############ print '... training' patience = 10000 # early-stopping parameter patience_increase = 2 improvement_threshold = 0.995 # setting validation_frequency this way guarantees the validation set is checked at least once per epoch. validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) if this_validation_loss < best_validation_loss: if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break layer0.save_net("layer0") layer1.save_net("layer1") layer2.save_net("layer2") layer3.save_net("layer3") end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
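# --- Hedged sketch (illustrative only): layer0.save_net(...) above is not defined in this section. A plausible
# minimal pair of helpers that pickle and restore a layer's shared W and b values is given below; the real method
# may store its parameters differently, so treat this purely as an assumption.
import pickle

def save_net(layer, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump((layer.W.get_value(), layer.b.get_value()), f)

def load_net(layer, name):
    with open(name + '.pkl', 'rb') as f:
        W_value, b_value = pickle.load(f)
    layer.W.set_value(W_value)
    layer.b.set_value(b_value)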
def evaluate_lenet5(learning_rate=0.05, n_epochs=100, dataset='F:/MOUD/0MOUD/jul14/x50_1/cktest/moud6.pkl.gz', nkerns=[5, 5, 5, 5, 5, 5, 5, 5, 5], batch_size=50, dirn='iti', indexd=0): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ global layer0gW global layer1gW global layer1bgW global layer1cgW global layer1dgW global layer1egW global layer1fgW global layer1ggW global layer1hgW global layer2gW global layer3gW global layer0gb global layer1gb global layer1bgb global layer1cgb global layer1dgb global layer1egb global layer1fgb global layer1ggW global layer1hgW global layer2gb global layer3gb global all_test global batchm global eval_print1 global eval_print2 global eval_print3 global neuron global epoch_cd global indk epoch_cd = 2 neuron = 5 batchm = 20 batch_size = batchm for nk in range(9): nkerns[nk] = neuron dirgtest = dirn l_r = T.scalar('l_r', dtype=theano.config.floatX) rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. 
im1x = 100 + 8 im1y = 100 poolx = 1 pooly = 1 layer0_input = x.reshape((batch_size, 1, im1x, im1y)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) nk1x = 7 nk1y = im1y # layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, im1x, im1y), filter_shape=(nkerns[0], 1, nk1x, nk1y), poolsize=(poolx, pooly)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2x = (im1x - nk1x + 1) / poolx im2y = (im1y - nk1y + 1) / pooly #im2x = (im1x+nk1x-1)/poolx #im2y = (im1y+nk1y-1)/pooly nk2x = 6 nk2y = im2y layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], im2x, im2y), filter_shape=(nkerns[1], nkerns[0], nk2x, nk2y), poolsize=(poolx, pooly)) # Construct the third convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2bx = (im2x - nk2x + 1) / poolx im2by = (im2y - nk2y + 1) / pooly #im2bx = (im2x+nk2x-1)/poolx #im2by = (im2y+nk2y-1)/pooly nk2bx = 5 nk2by = im2by layer1b = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], im2bx, im2by), filter_shape=(nkerns[2], nkerns[1], nk2bx, nk2by), poolsize=(poolx, pooly)) # Construct the fourth convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2cx = (im2bx - nk2bx + 1) / poolx im2cy = (im2by - nk2by + 1) / pooly nk2cx = 4 nk2cy = im2cy layer1c = LeNetConvPoolLayer(rng, input=layer1b.output, image_shape=(batch_size, nkerns[2], im2cx, im2cy), filter_shape=(nkerns[3], nkerns[2], nk2cx, nk2cy), poolsize=(poolx, pooly)) # Construct the fifth convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2dx = (im2cx - nk2cx + 1) / poolx im2dy = (im2cy - nk2cy + 1) / pooly nk2dx = 3 nk2dy = im2dy layer1d = LeNetConvPoolLayer(rng, input=layer1c.output, image_shape=(batch_size, nkerns[3], im2dx, im2dy), filter_shape=(nkerns[4], nkerns[3], nk2dx, nk2dy), poolsize=(poolx, pooly)) # Construct the sixth convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2ex = (im2dx - nk2dx + 1) / poolx im2ey = (im2dy - nk2dy + 1) / pooly nk2ex = 3 nk2ey = im2ey layer1e = LeNetConvPoolLayer(rng, input=layer1d.output, image_shape=(batch_size, nkerns[4], im2ex, im2ey), filter_shape=(nkerns[5], nkerns[4], nk2ex, nk2ey), poolsize=(poolx, pooly)) # Construct the seven convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) im2fx = (im2ex - nk2ex + 1) / poolx im2fy = (im2ey - nk2ey + 1) / pooly nk2fx = 3 nk2fy = im2fy layer1f = 
LeNetConvPoolLayer(rng, input=layer1e.output, image_shape=(batch_size, nkerns[5], im2fx, im2fy), filter_shape=(nkerns[6], nkerns[5], nk2fx, nk2fy), poolsize=(poolx, pooly)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer2_input = layer1b.output.flatten(2) #im3x = (im2hx-nk2hx+1)/poolx #im3y = (im2hy-nk2hy+1)/pooly im3x = (im2bx - nk2bx + 1) / poolx im3y = (im2by - nk2by + 1) / pooly layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[4] * im3x * im3y, n_out=100, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=100, n_out=2) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) test_model2 = theano.function( [index], layer3.errors2(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent # params = layer3.params + layer2.params + layer1f.params + layer1e.params + layer1d.params + layer1c.params + layer1b.params + layer1.params + layer0.params params = layer3.params + layer2.params + layer1b.params + layer1.params + layer0.params #layer1h.params + layer1g.params + layer1f.params + layer1e.params + layer1d.params + layer1c.params + layer1b.params + layer1.params + layer0.params if indexd > indk: epoch_cd = 1 learning_rate = 0 n_epochs = 1 f = file(dirgtest + "/weights/layer0w_" + str(indexd) + ".save", 'rb') lay_params = cPickle.load(f) Wl1, bl1 = lay_params layer0.W.set_value(Wl1.get_value()) layer0.b.set_value(bl1.get_value()) f.close() f = file(dirgtest + "/weights/layer1w_" + str(indexd) + ".save", 'rb') lay_params = cPickle.load(f) Wl1, bl1 = lay_params layer1.W.set_value(Wl1.get_value()) layer1.b.set_value(bl1.get_value()) f.close() f = file(dirgtest + "/weights/layer1bw_" + str(indexd) + ".save", 'rb') lay_params = cPickle.load(f) Wl1, bl1 = lay_params layer1b.W.set_value(Wl1.get_value()) layer1b.b.set_value(bl1.get_value()) f.close() #f = file(dirgtest+"/weights/layer1cw_"+str(indexd)+".save",'rb') #lay_params = cPickle.load(f) #Wl1, bl1 = lay_params #layer1c.W.set_value(Wl1.get_value()); #layer1c.b.set_value(bl1.get_value()); #f.close() #f = file(dirgtest+"/weights/layer1dw_"+str(indexd)+".save",'rb') #lay_params = cPickle.load(f) #Wl1, bl1 = lay_params #layer1d.W.set_value(Wl1.get_value()); #layer1d.b.set_value(bl1.get_value()); #f.close() #f = file(dirgtest+"/weights/layer1ew_"+str(indexd)+".save",'rb') #lay_params = cPickle.load(f) #Wl1, bl1 = lay_params #layer1e.W.set_value(Wl1.get_value()); #layer1e.b.set_value(bl1.get_value()); #f.close() #f = file(dirgtest+"/weights/layer1fw_"+str(indexd)+".save",'rb') #lay_params = cPickle.load(f) #Wl1, bl1 = lay_params #layer1f.W.set_value(Wl1.get_value()); 
#layer1f.b.set_value(bl1.get_value()); #f.close() f = file(dirgtest + "/weights/layer2w_" + str(indexd) + ".save", 'rb') lay_params = cPickle.load(f) Wl1, bl1 = lay_params layer2.W.set_value(Wl1.get_value()) layer2.b.set_value(bl1.get_value()) f.close() f = file(dirgtest + "/weights/layer3w_" + str(indexd) + ".save", 'rb') lay_params = cPickle.load(f) Wl1, bl1 = lay_params layer3.W.set_value(Wl1.get_value()) layer3.b.set_value(bl1.get_value()) f.close() # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - l_r * grad_i) for param_i, grad_i in zip(params, grads)] #updates = [ # (param_i, param_i - learning_rate * grad_i) # for param_i, grad_i in zip(params, grads) #] train_model = theano.function( [index, l_r], #[index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 learning_rate = 0.99 * learning_rate if epoch == 1: print layer0.W.get_value().shape print layer0.b.get_value().shape eval_set_x = test_set_x eval_shape = train_set_x.get_value(borrow=True).shape eval_layer2 = theano.function( [index], layer0_input, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print1 = [eval_layer2(i) for i in xrange(n_test_batches)] eval_set_x = train_set_x eval_layer2 = theano.function( [index], layer0_input, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print2 = [eval_layer2(i) for i in xrange(n_train_batches)] eval_set_x = valid_set_x eval_layer2 = theano.function( [index], layer0_input, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print3 = [eval_layer2(i) for i in xrange(n_valid_batches)] if indk == 10: Wl1, bl1 = morbrun1(nk1x, nk1y, im1x, im1y) learning_rate = 0 if epoch == 2: print layer1.W.get_value().shape print layer1.b.get_value().shape layer0.W.set_value(Wl1.get_value()) layer0.b.set_value(bl1.get_value()) eval_set_x = test_set_x eval_shape = train_set_x.get_value(borrow=True).shape eval_layer2 = theano.function( [index], layer0.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print1 = [eval_layer2(i) for i in xrange(n_test_batches)] eval_set_x = train_set_x eval_layer2 = theano.function( [index], layer0.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print2 = [eval_layer2(i) for i in xrange(n_train_batches)] eval_set_x = valid_set_x eval_layer2 = theano.function( [index], layer0.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print3 = [eval_layer2(i) for i in xrange(n_valid_batches)] if indk == 10: Wl2, bl2 = morbrun1(nk2x, nk2y, im2x, im2y, neuron) if epoch == 3: print layer1b.W.get_value().shape layer1.W.set_value(Wl2.get_value()) layer1.b.set_value(bl2.get_value()) layer0.W.set_value(Wl1.get_value()) layer0.b.set_value(bl1.get_value()) eval_set_x = test_set_x eval_shape = train_set_x.get_value(borrow=True).shape eval_layer2 = theano.function( [index], layer1.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print1 = [eval_layer2(i) for i in xrange(n_test_batches)] eval_set_x = train_set_x eval_layer2 = theano.function( [index], layer1.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print2 = [eval_layer2(i) for i in xrange(n_train_batches)] eval_set_x = valid_set_x eval_layer2 = theano.function( [index], layer1.output, givens={ x: eval_set_x[index * batch_size:(index + 1) * batch_size] }) eval_print3 = [eval_layer2(i) for i in xrange(n_valid_batches)] if indk == 10: Wl3, bl3 = morbrun1(nk2bx, nk2by, im2bx, im2by, neuron) layer1b.W.set_value(Wl3.get_value()) layer1b.b.set_value(bl3.get_value()) layer1.W.set_value(Wl2.get_value()) layer1.b.set_value(bl2.get_value()) layer0.W.set_value(Wl1.get_value()) layer0.b.set_value(bl1.get_value()) n_in = nkerns[4] * im3x * im3y n_out = 100 W_values = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) layer2.W.set_value(W_values) layer2.b.set_value(numpy.zeros(n_out)) n_in = 100 n_out = 2 W_values = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. 
/ (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) layer3.W.set_value(W_values) layer3.b.set_value(numpy.zeros(n_out)) learning_rate = 0.01 # # # if epoch == 4: # print layer1c.W.get_value().shape # # layer1b.W.set_value(Wl3.get_value()); # layer1b.b.set_value(bl3.get_value()); # layer1.W.set_value(Wl2.get_value()); # layer1.b.set_value(bl2.get_value()); # layer0.W.set_value(Wl1.get_value()); # layer0.b.set_value(bl1.get_value()); # # eval_set_x = test_set_x; # eval_shape = train_set_x.get_value(borrow=True).shape; # eval_layer2 = theano.function([index], layer1b.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # eval_print1 = [ # eval_layer2(i) # for i in xrange(n_test_batches) # ] # # eval_set_x = train_set_x; # # eval_layer2 = theano.function([index], layer1b.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print2 = [ # eval_layer2(i) # for i in xrange(n_train_batches) # ] # # eval_set_x = valid_set_x; # # eval_layer2 = theano.function([index], layer1b.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print3 = [ # eval_layer2(i) # for i in xrange(n_valid_batches) # ] # if indk == 10: # Wl4, bl4 = morbrun1(nk2cx,nk2cy,im2cx,im2cy,neuron) # # # # if epoch == 5: # print layer1d.W.get_value().shape # layer1c.W.set_value(Wl4.get_value()); # layer1c.b.set_value(bl4.get_value()); # layer1b.W.set_value(Wl3.get_value()); # layer1b.b.set_value(bl3.get_value()); # layer1.W.set_value(Wl2.get_value()); # layer1.b.set_value(bl2.get_value()); # layer0.W.set_value(Wl1.get_value()); # layer0.b.set_value(bl1.get_value()); # eval_set_x = test_set_x; # eval_shape = train_set_x.get_value(borrow=True).shape; # eval_layer2 = theano.function([index], layer1c.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # eval_print1 = [ # eval_layer2(i) # for i in xrange(n_test_batches) # ] # # eval_set_x = train_set_x; # # eval_layer2 = theano.function([index], layer1c.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print2 = [ # eval_layer2(i) # for i in xrange(n_train_batches) # ] # # eval_set_x = valid_set_x; # # eval_layer2 = theano.function([index], layer1c.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print3 = [ # eval_layer2(i) # for i in xrange(n_valid_batches) # ] # # if indk == 10: # Wl5, bl5 = morbrun1(nk2dx,nk2dy,im2dx,im2dy,neuron) # # # # if epoch == 6: # print layer1e.W.get_value().shape # layer1d.W.set_value(Wl5.get_value()); # layer1d.b.set_value(bl5.get_value()); # layer1c.W.set_value(Wl4.get_value()); # layer1c.b.set_value(bl4.get_value()); # layer1b.W.set_value(Wl3.get_value()); # layer1b.b.set_value(bl3.get_value()); # layer1.W.set_value(Wl2.get_value()); # layer1.b.set_value(bl2.get_value()); # layer0.W.set_value(Wl1.get_value()); # layer0.b.set_value(bl1.get_value()); # # eval_set_x = test_set_x; # eval_shape = train_set_x.get_value(borrow=True).shape; # eval_layer2 = theano.function([index], layer1d.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # eval_print1 = [ # eval_layer2(i) # for i in xrange(n_test_batches) # ] # # eval_set_x = train_set_x; # # eval_layer2 = theano.function([index], layer1d.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print2 = [ # eval_layer2(i) # for i in xrange(n_train_batches) # ] # # eval_set_x = valid_set_x; # # eval_layer2 = 
theano.function([index], layer1d.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print3 = [ # eval_layer2(i) # for i in xrange(n_valid_batches) # ] # # if indk == 10: # Wl6,bl6 = morbrun1(nk2ex,nk2ey,im2ex,im2ey,neuron) # # # if epoch == 7: # print layer1f.W.get_value().shape # layer1e.W.set_value(Wl6.get_value()); # layer1e.b.set_value(bl6.get_value()); # layer1d.W.set_value(Wl5.get_value()); # layer1d.b.set_value(bl5.get_value()); # layer1c.W.set_value(Wl4.get_value()); # layer1c.b.set_value(bl4.get_value()); # layer1b.W.set_value(Wl3.get_value()); # layer1b.b.set_value(bl3.get_value()); # layer1.W.set_value(Wl2.get_value()); # layer1.b.set_value(bl2.get_value()); # layer0.W.set_value(Wl1.get_value()); # layer0.b.set_value(bl1.get_value()); # # eval_set_x = test_set_x; # eval_shape = train_set_x.get_value(borrow=True).shape; # eval_layer2 = theano.function([index], layer1e.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # eval_print1 = [ # eval_layer2(i) # for i in xrange(n_test_batches) # ] # # eval_set_x = train_set_x; # # eval_layer2 = theano.function([index], layer1e.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print2 = [ # eval_layer2(i) # for i in xrange(n_train_batches) # ] # # eval_set_x = valid_set_x; # # eval_layer2 = theano.function([index], layer1e.output, # givens={ # x: eval_set_x[index * batch_size: (index + 1) * batch_size]}) # # eval_print3 = [ # eval_layer2(i) # for i in xrange(n_valid_batches) # ] # # if indk == 10: # Wl7,bl7 = morbrun1(nk2fx,nk2fy,im2fx,im2fy,neuron) # layer1f.W.set_value(Wl7.get_value()); # layer1f.b.set_value(bl7.get_value()); # layer1e.W.set_value(Wl6.get_value()); # layer1e.b.set_value(bl6.get_value()); # layer1d.W.set_value(Wl5.get_value()); # layer1d.b.set_value(bl5.get_value()); # layer1c.W.set_value(Wl4.get_value()); # layer1c.b.set_value(bl4.get_value()); # layer1b.W.set_value(Wl3.get_value()); # layer1b.b.set_value(bl3.get_value()); # layer1.W.set_value(Wl2.get_value()); # layer1.b.set_value(bl2.get_value()); # layer0.W.set_value(Wl1.get_value()); # layer0.b.set_value(bl1.get_value()); # # n_in=nkerns[4] * im3x * im3y # n_out=100 # W_values = numpy.asarray( # rng.uniform( # low=-numpy.sqrt(6. / (n_in + n_out)), # high=numpy.sqrt(6. / (n_in + n_out)), # size=(n_in, n_out) # ), # dtype=theano.config.floatX # ) # layer2.W.set_value(W_values); # layer2.b.set_value(numpy.zeros(n_out)) # n_in=100 # n_out=2 # W_values = numpy.asarray( # rng.uniform( # low=-numpy.sqrt(6. / (n_in + n_out)), # high=numpy.sqrt(6. 
/ (n_in + n_out)), # size=(n_in, n_out) # ), # dtype=theano.config.floatX # ) # layer3.W.set_value(W_values); # layer3.b.set_value(numpy.zeros(n_out)) # learning_rate=0.01 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index, learning_rate) #cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() try: os.remove(dirgtest + "/run2/pred_y" + str(indexd) + ".csv") except OSError: pass predy = open(dirgtest + "/run2/pred_y" + str(indexd) + ".csv", 'a') test_losses = [test_model2(i) for i in xrange(n_test_batches)] np.savetxt(predy, test_losses, delimiter='\n') predy.close() print('Optimization complete.') print( 'Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) all_test += test_score print str(all_test) + ' ' + str(indexd) with open(dirgtest + "/run2/cv_score.txt", "a") as myfile: myfile.write(str(test_score) + "\n") if indk == 10: print "saving \n" f = file(dirgtest + "/weights/layer0w_" + str(indexd) + ".save", 'wb') cPickle.dump(layer0.params, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() f = file(dirgtest + "/weights/layer1w_" + str(indexd) + ".save", 'wb') cPickle.dump(layer1.params, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() f = file(dirgtest + "/weights/layer1bw_" + str(indexd) + ".save", 'wb') cPickle.dump(layer1b.params, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() #f = file(dirgtest+"/weights/layer1cw_"+str(indexd)+".save", 'wb') #cPickle.dump(layer1c.params, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.close() # #f = file(dirgtest+"/weights/layer1dw_"+str(indexd)+".save", 'wb') #cPickle.dump(layer1d.params, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.close() # #f = file(dirgtest+"/weights/layer1ew_"+str(indexd)+".save", 'wb') #cPickle.dump(layer1e.params, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.close() # #f = file(dirgtest+"/weights/layer1fw_"+str(indexd)+".save", 'wb') #cPickle.dump(layer1f.params, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.close() f = file(dirgtest + "/weights/layer2w_" + str(indexd) + ".save", 'wb') cPickle.dump(layer2.params, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() f = file(dirgtest + "/weights/layer3w_" + str(indexd) + ".save", 'wb') 
cPickle.dump(layer3.params, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close()
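# --- Hedged sketch (illustrative only): unlike the earlier MNIST script, the function above passes the learning
# rate into train_model as a symbolic scalar l_r, which lets it decay the Python-side value each epoch
# (learning_rate = 0.99 * learning_rate) without recompiling the function. The stripped-down version below uses a
# toy quadratic cost and a placeholder shared parameter W purely to demonstrate that pattern.
import numpy
import theano
import theano.tensor as T

W = theano.shared(numpy.zeros(5, dtype=theano.config.floatX), name='W')
x = T.vector('x')
l_r = T.scalar('l_r', dtype=theano.config.floatX)

cost = T.sum((W - x) ** 2)
grad = T.grad(cost, W)
train_step = theano.function([x, l_r], cost, updates=[(W, W - l_r * grad)])

learning_rate = 0.05
x_data = numpy.arange(5, dtype=theano.config.floatX)
for epoch in range(10):
    train_step(x_data, learning_rate)
    learning_rate *= 0.99  # decayed on the Python side; the compiled function is reused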
def build_lenet5(params, nkerns=[48, 128, 192, 192], batch_size=1): # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (50, 50) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 50, 50)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(params[5], input=layer0_input, image_shape=(batch_size, 1, 50, 50), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(params[4], input=layer0.output, image_shape=(batch_size, nkerns[0], 23, 23), filter_shape=(nkerns[1], nkerns[0], 3, 3), poolsize=(2, 2)) '''layer1_3 = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 10, 10), filter_shape=(nkerns[2], nkerns[1], 3, 3), poolsize=(2, 2))''' layer1_3 = LeNetConvPoolLayerNoPooling(params[3], input=layer1.output, image_shape=(batch_size, nkerns[1], 10, 10), filter_shape=(nkerns[2], nkerns[1], 3, 3)) layer1_4 = LeNetConvPoolLayer(params[2], input=layer1_3.output, image_shape=(batch_size, nkerns[2], 8, 8), filter_shape=(nkerns[3], nkerns[2], 3, 3), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1_4.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(None, input=layer2_input, n_in=nkerns[3] * 3 * 3, n_out=1920, W=params[1][0], b=params[1][1], activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(params[0], input=layer2.output, n_in=1920, n_out=58) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model predict_model = theano.function([x], layer3.y_pred) #predict_model = theano.function([x], layer3.p_y_given_x) return predict_model
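# --- Hedged usage sketch (illustrative only): build_lenet5 returns just a compiled predictor, so it can be used for
# inference once a parameter list has been unpickled. The file name below is hypothetical; the only assumptions taken
# from the code above are that params[0] holds the output layer's parameters, params[5] the first conv layer's, and
# that the input is a rasterized 50x50 grayscale image.
import pickle
import numpy

with open('lenet5_params.pkl', 'rb') as f:  # hypothetical path
    params = pickle.load(f)

predict_model = build_lenet5(params, batch_size=1)

image = numpy.random.rand(1, 50 * 50).astype('float32')  # stand-in for one real image row
print predict_model(image)  # predicted class label(s)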
def predict(testNumber, dataset='dataset3.pkl', MEAN=True): """ An example of how to load a trained model and use it to predict labels. """ rng = numpy.random.RandomState(23455) finalSize = 200 index = T.lscalar() x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # load the saved model #layer4 = pickle.load(open('best_model.pkl')) basePath = r'C:\Users\Matt\Desktop\DogProj\data' f = open(os.path.join(basePath, 'best_model.pkl'), 'rb') layer4.W, layer4.b, layer3.W, layer3.b, layer2.W, layer2.b, layer1.W, layer1.b, layer0.W, layer0.b, validHolder, trainHolder = pickle.load( f) # print('blah') print(numpy.array(layer0.W.get_value())[3, 0, ...]) f.close() # compile a predictor function # We can test it on some examples from test test ##dataset='dataset3.pkl' dataset = os.path.join(basePath, dataset) datasets = load_data(dataset) test_set_x, test_set_y = datasets[2] valid_set_x, valid_set_y = datasets[1] print(numpy.array(valid_set_y)) #test_set_x = test_set_x.get_value() if MEAN == True: test_set_x.set_value( test_set_x.get_value(borrow=True) - numpy.mean(test_set_x.get_value(borrow=True))) test_set_x = test_set_x.get_value() layer0_input = x.reshape((testNumber, 1, 200, 200)) layer0new = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=layer0.image_shape, filter_shape=layer0.filter_shape, #5,5 before poolsize=(2, 2)) layer0new.W.set_value(layer0.W.get_value()) layer0new.b.set_value(layer0.b.get_value()) layer1new = LeNetConvPoolLayer(rng, input=layer0new.output, image_shape=layer1.image_shape, filter_shape=layer1.filter_shape, poolsize=(2, 2)) layer1new.W.set_value(layer1.W.get_value()) layer1new.b.set_value(layer1.b.get_value()) layer2new = LeNetConvPoolLayer(rng, input=layer1new.output, image_shape=layer2.image_shape, filter_shape=layer2.filter_shape, poolsize=(2, 2)) layer2new.W.set_value(layer2.W.get_value()) layer2new.b.set_value(layer2.b.get_value()) layer3_input = layer2new.output.flatten(2) layer3new = HiddenLayer( rng, input=layer3_input, n_in=layer3.n_in, n_out=layer3. n_out, #was 50, isn't this batch_size? nope no. hidden units activation=T.tanh) layer3new.W.set_value(layer3.W.get_value()) layer3new.b.set_value(layer3.b.get_value()) layer4new = LogisticRegression(rng, input=layer3new.output, n_in=layer3.n_out, n_out=2) layer4new.W.set_value(layer4.W.get_value()) layer4new.b.set_value(layer4.b.get_value()) test_model = theano.function([index], [ layer4new.y_pred, y, layer4new.p_y_given_x, x, layer0.W, layer1.W, layer2.W, test_set_y ], givens={ x: test_set_x[0:testNumber, ...], y: test_set_y[0:testNumber] }, on_unused_input='warn') print('test_set_y') predicted_values = test_model(1) print(numpy.array(predicted_values[7])) ''',y,p_y_given_x,x''' filter0_ = numpy.array(predicted_values[4])[0, 0, ...] filter0 = filter0_ / (abs(filter0_).max() / 255.0) filter01_ = numpy.array(predicted_values[4])[1, 0, ...] filter01 = filter01_ / (abs(filter01_).max() / 255.0) filter02_ = numpy.array(predicted_values[4])[2, 0, ...] filter02 = filter02_ / (abs(filter02_).max() / 255.0) filter03_ = numpy.array(predicted_values[4])[3, 0, ...] filter03 = filter03_ / (abs(filter03_).max() / 255.0) filter04_ = numpy.array(predicted_values[4])[4, 0, ...] filter04 = filter04_ / (abs(filter04_).max() / 255.0) filter05_ = numpy.array(predicted_values[4])[5, 0, ...] filter05 = filter05_ / (abs(filter05_).max() / 255.0) filter06_ = numpy.array(predicted_values[4])[6, 0, ...] 
filter06 = filter06_ / (abs(filter06_).max() / 255.0) filter07_ = numpy.array(predicted_values[4])[7, 0, ...] filter07 = filter07_ / (abs(filter07_).max() / 255.0) filter08_ = numpy.array(predicted_values[4])[8, 0, ...] filter08 = filter08_ / (abs(filter08_).max() / 255.0) filter09_ = numpy.array(predicted_values[4])[9, 0, ...] filter09 = filter09_ / (abs(filter09_).max() / 255.0) filter1_ = numpy.array(predicted_values[5])[0, 0, ...] filter1 = filter1_ / (abs(filter1_).max() / 255.0) filter11_ = numpy.array(predicted_values[5])[1, 0, ...] filter11 = filter11_ / (abs(filter11_).max() / 255.0) filter12_ = numpy.array(predicted_values[5])[2, 0, ...] filter12 = filter12_ / (abs(filter12_).max() / 255.0) filter13_ = numpy.array(predicted_values[5])[3, 0, ...] filter13 = filter13_ / (abs(filter13_).max() / 255.0) filter14_ = numpy.array(predicted_values[5])[4, 0, ...] filter14 = filter14_ / (abs(filter14_).max() / 255.0) filter15_ = numpy.array(predicted_values[5])[5, 0, ...] filter15 = filter15_ / (abs(filter15_).max() / 255.0) filter16_ = numpy.array(predicted_values[5])[6, 0, ...] filter16 = filter16_ / (abs(filter16_).max() / 255.0) filter17_ = numpy.array(predicted_values[5])[7, 0, ...] filter17 = filter17_ / (abs(filter17_).max() / 255.0) filter18_ = numpy.array(predicted_values[5])[8, 0, ...] filter18 = filter18_ / (abs(filter18_).max() / 255.0) filter19_ = numpy.array(predicted_values[5])[9, 0, ...] filter19 = filter19_ / (abs(filter19_).max() / 255.0) filter2_ = numpy.array(predicted_values[6])[0, 0, ...] filter2 = filter2_ / (abs(filter2_).max() / 255.0) filter21_ = numpy.array(predicted_values[6])[1, 0, ...] filter21 = filter21_ / (abs(filter21_).max() / 255.0) filter22_ = numpy.array(predicted_values[6])[2, 0, ...] filter22 = filter22_ / (abs(filter22_).max() / 255.0) filter23_ = numpy.array(predicted_values[6])[3, 0, ...] filter23 = filter23_ / (abs(filter23_).max() / 255.0) filter24_ = numpy.array(predicted_values[6])[4, 0, ...] filter24 = filter24_ / (abs(filter24_).max() / 255.0) filter25_ = numpy.array(predicted_values[6])[5, 0, ...] filter25 = filter25_ / (abs(filter25_).max() / 255.0) filter26_ = numpy.array(predicted_values[6])[6, 0, ...] filter26 = filter26_ / (abs(filter26_).max() / 255.0) filter27_ = numpy.array(predicted_values[6])[7, 0, ...] filter27 = filter27_ / (abs(filter27_).max() / 255.0) filter28_ = numpy.array(predicted_values[6])[8, 0, ...] filter28 = filter28_ / (abs(filter28_).max() / 255.0) filter29_ = numpy.array(predicted_values[6])[9, 0, ...]
filter29 = filter29_ / (abs(filter29_).max() / 255.0) totFilter0 = numpy.hstack([ filter0, filter01, filter02, filter03, filter04, filter05, filter06, filter07, filter08, filter09 ]) totFilter1 = numpy.hstack([ filter1, filter11, filter12, filter13, filter14, filter15, filter16, filter17, filter18, filter19 ]) totFilter2 = numpy.hstack([ filter2, filter21, filter22, filter23, filter24, filter25, filter26, filter27, filter28, filter29 ]) totFilter = numpy.vstack([totFilter0, totFilter1, totFilter2]) #plt.imshow(totFilter, cmap = cm.Greys_r, interpolation='nearest') #plt.show() #print(layer3.output) print(test_set_x.shape) print("Predicted values for the first 2 examples in test set:") y = predicted_values[1] print(numpy.array(range(predicted_values[2].shape[0]))) #print([0:(predicted_values[2].shape[0])]) print( numpy.transpose(numpy.array(range( predicted_values[2].shape[0]))).shape) CountTrans = numpy.transpose( numpy.array(range(predicted_values[2].shape[0]))) print(CountTrans.shape[0]) CountTrans = CountTrans.reshape(CountTrans.shape[0], 1) #CountTrans.dimshuffle('x', 0) print(CountTrans.shape) predPrint = numpy.hstack([predicted_values[2], CountTrans]) print(predPrint) print(predicted_values[0]) print('test error = ' + str(sum(predicted_values[0] != y) / y.shape[0])) print('Actual values:') print(y) print('herehererererererererererererere') print(predicted_values[2][:, y[0]].shape) print(validHolder) print(predicted_values[2]) #.plot(validHolder) #plt.plot(trainHolder) #plt.show() return (predicted_values[2][:, y[0]], validHolder, predicted_values[2][testNumber - 1, y[testNumber - 1]])
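# --- Hedged sketch (illustrative only): the visualisation above builds each row of the filter mosaic with one
# hand-written statement per filter. The loop below produces the same kind of normalised mosaic; like the original
# hstack/vstack code, it assumes the per-layer rows end up with equal widths. It is a refactoring suggestion only
# and does not change the function above.
import numpy

def filter_mosaic(conv_weights, n=10):
    """Normalise the first n filters of each conv weight tensor to [0, 255] and tile them, one row per layer."""
    rows = []
    for W in conv_weights:
        W = numpy.array(W)
        tiles = [W[i, 0, ...] / (abs(W[i, 0, ...]).max() / 255.0) for i in range(n)]
        rows.append(numpy.hstack(tiles))
    return numpy.vstack(rows)

# e.g. totFilter = filter_mosaic([predicted_values[4], predicted_values[5], predicted_values[6]])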
def evaluate_lenet5(learning_rate=0.1, n_epochs=100, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) # datasets = load_data(dataset) # train_set_x, train_set_y = datasets[0] # valid_set_x, valid_set_y = datasets[1] # test_set_x, test_set_y = datasets[2] data = load_svmlight_file("./MachineLearning/DS3.libsvm") XA, testSetX, YA, testSetY = train_test_split(data[0], data[1], test_size=0.3, random_state=1) trainSetX, validSetX, trainSetY, validSetY = train_test_split( XA, YA, test_size=0.5, random_state=1) train_set_x, train_set_y = shared_dataset((trainSetX.toarray(), trainSetY)) valid_set_x, valid_set_y = shared_dataset((validSetX.toarray(), validSetY)) test_set_x, test_set_y = shared_dataset((testSetX.toarray(), testSetY)) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) p = params for k in range(len(p)): lenOfP = 0 evP = p[k].eval() lenOfP = checkLen(evP) print("W-B Count :", lenOfP, evP.shape)
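# --- Hedged sketch (illustrative only): the libsvm variant above relies on shared_dataset, which is not defined in
# this section. The helper below follows the standard Theano recipe for pushing data into shared variables so that
# minibatches can be sliced on the device; the original helper may differ in detail.
import numpy
import theano
import theano.tensor as T

def shared_dataset(data_xy, borrow=True):
    """Load a (features, labels) pair into Theano shared variables."""
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    # labels are stored as floats on the device but consumed as int32 by T.ivector
    return shared_x, T.cast(shared_y, 'int32')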
def evaluate_lenet5( learning_rate=0.01, n_epochs=1, dataset='dataset3.pkl', nkerns=[20, 50, 50], batch_size=10, L1Value=0.00005, L2Value=0.0003 ): #nkerns should be 20,50,50, was 2,2,2 then 5,5,5 (slower cz more weights) """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ basePath = r'C:\Users\Matt\Desktop\DogProj\data' rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] print('train set x') print(numpy.max(train_set_x.get_value(borrow=True))) print(numpy.min(train_set_x.get_value(borrow=True))) print((train_set_x.get_value(borrow=True)).sum() / ((train_set_x.get_value(borrow=True).shape[1]) * (train_set_x.get_value(borrow=True).shape[0]))) #for L in range(train_set_x.get_value(borrow=True).shape[1]): # train_set_x.set_value(train_set_x.get_value(borrow=True)[L,...]-numpy.mean(train_set_x.get_value(borrow=True)[L,...])) #train_set_x.set_value(train_set_x.get_value(borrow=True)-numpy.mean(train_set_x.get_value(borrow=True))) a = (train_set_x.get_value(borrow=True) > 0) #.astype(float) b = (train_set_x.get_value(borrow=True) < 0) #.astype(float) #train_set_x.set_value(a) valid_set_x.set_value( valid_set_x.get_value(borrow=True) - numpy.mean(valid_set_x.get_value(borrow=True))) a = (valid_set_x.get_value(borrow=True) > 0) #.astype(float) b = (valid_set_x.get_value(borrow=True) < 0) #.astype(float) #valid_set_x.set_value(a) test_set_x.set_value( test_set_x.get_value(borrow=True) - numpy.mean(test_set_x.get_value(borrow=True))) a = (test_set_x.get_value(borrow=True) > 0) #.astype(float) b = (test_set_x.get_value(borrow=True) < 0) #.astype(float) #test_set_x.set_value(a) print(numpy.max(train_set_x.get_value(borrow=True))) print(numpy.min(train_set_x.get_value(borrow=True))) print((train_set_x.get_value(borrow=True)).sum() / ((train_set_x.get_value(borrow=True).shape[1]) * (train_set_x.get_value(borrow=True).shape[0]))) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') finalSize = 200 # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. 
layer0_input = x.reshape((batch_size, 1, finalSize, finalSize)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, finalSize, finalSize), filter_shape=(nkerns[0], 1, 9, 9), #5,5 before poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 96, 96), filter_shape=(nkerns[1], nkerns[0], 9, 9), poolsize=(2, 2)) layer2 = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 44, 44), filter_shape=(nkerns[2], nkerns[1], 9, 9), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer3_input = layer2.output.flatten(2) # construct a fully-connected sigmoidal layer layer3 = HiddenLayer( rng, input=layer3_input, n_in=nkerns[2] * 18 * 18, n_out=81, #was 50, isn't this batch_size? nope no. hidden units activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer4 = LogisticRegression(rng, input=layer3.output, n_in=81, n_out=2) # the cost we minimize during training is the NLL of the model cost = (layer4.negative_log_likelihood(y) + L2Value * (layer0.L2 + layer0.L2 + layer3.L2 + layer4.L2)) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer4.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params0 = layer3.params + layer2.params + layer1.params + layer0.params params1 = layer4.params # create a list of gradients for all model parameters grads0 = T.grad(cost, params0) grads1 = T.grad(cost, params1) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. 
''' updates = [ for param_i, grad_i in zip(params0, grads0): (param_i, param_i - learning_rate * grad_i) for param_j, grad_j in zip(params1, grads1): (param_j, param_j - learning_rate/10 * grad_j) ] ''' updates = [] for param_i, grad_i in zip(params0, grads0): updates = updates + [(param_i, param_i - learning_rate * grad_i)] for param_j, grad_j in zip(params1, grads1): updates = updates + [(param_j, param_j - learning_rate / 5 * grad_j)] train_model = theano.function( [index], [ cost, layer4.p_y_given_x, layer4.y_pred, layer0.W, layer1.W, #5 layer2.W, layer3.W, layer4.W, layer0.output, layer4.b, layer4.p_y_given_x, #6 y, layer4.errors(y), layer0.preOutput, layer1.preOutput, layer2.preOutput, #5 layer0.output, layer2.output, layer3.preOutput, layer4.preOutput, #4 layer4.W, layer4.b, layer4.input, test_set_y, #4 layer0.b, layer1.b, layer2.b, layer3.b, layer4.b ], #5 updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ''', mode=theano.compile.MonitorMode( pre_func=inspect_inputs, post_func=inspect_outputs)''' # end-snippet-1 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 500 #10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.9995 #0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False counter3 = 0 counter4 = 0 filterHolder = [] validHolder = [] trainHolder = [] costHolder = [] while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 costHolder = [] for minibatch_index in range(int(n_train_batches)): iter = (epoch - 1) * n_train_batches + minibatch_index print(iter) print(epoch) if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index) print('bbbb') print((layer4.W)) print(numpy.array(cost_ij[7])) print(cost_ij[7].shape) print('test set y') print(numpy.array(cost_ij[23])) print() #print(cost_ij) #print(numpy.array(cost_ij[3])[0,0,...]) #print(numpy.array(cost_ij[3])[1,0,...]) #print('shape') #print(numpy.array(cost_ij[3])[0,0,...]) filter0_ = numpy.array(cost_ij[3])[0, 0, ...] filter0 = filter0_ / (filter0_.max() / 255.0) #filter0 = filter0_ filter01_ = numpy.array(cost_ij[3])[1, 0, ...] filter01 = filter01_ / (filter01_.max() / 255.0) #filter0 = filter0_ #print('filter0.shape') #print(filter0.shape) filter1_ = numpy.array(cost_ij[4])[0, 0, ...] filter1 = filter1_ / (filter1_.max() / 255.0) #filter1 =filter1_ filter11_ = numpy.array(cost_ij[4])[1, 0, ...] filter11 = filter11_ / (filter11_.max() / 255.0) #filter1 =filter1_ filter2_ = numpy.array(cost_ij[5])[0, 0, ...] filter2 = filter2_ / (filter2_.max() / 255.0) filter21_ = numpy.array(cost_ij[5])[1, 0, ...] 
filter21 = filter21_ / (filter21_.max() / 255.0) #filter2 =filter2_ hiddenW_ = numpy.array(cost_ij[6])[:, 0] hiddenW = hiddenW_ / (hiddenW_.max() / 255.0) #hiddenW=hiddenW_ logRWT_ = numpy.array(cost_ij[7])[:, 0] logRWT = logRWT_ / (logRWT_.max() / 255.0) #logRWT=logRWT_ logRWF_ = numpy.array(cost_ij[7])[:, 1] logRWF = logRWF_ / (logRWF_.max() / 255.0) #logRWF=logRWF_ hiddenW = numpy.reshape(hiddenW[0:81], [9, 9]) logRWT = numpy.reshape(logRWT[0:81], [9, 9]) #was [5,5] logRWF = numpy.reshape(logRWF[0:81], [9, 9]) #gradientP = numpy.array(cost_ij[7]) #print('shapes') #print(hiddenW.shape) #print(logRWT.shape) #filter1.resize([filter0.shape[0],filter0.shape[1]]) #print(filter1) #print() filter1 = numpy.vstack([ filter1, numpy.zeros( [filter0.shape[0] - filter1.shape[0], filter1.shape[1]]) ]) filter1 = numpy.hstack([ filter1, numpy.zeros( [filter1.shape[0], filter0.shape[1] - filter1.shape[1]]) ]) filter2 = numpy.vstack([ filter2, numpy.zeros( [filter0.shape[0] - filter2.shape[0], filter2.shape[1]]) ]) filter2 = numpy.hstack([ filter2, numpy.zeros( [filter2.shape[0], filter0.shape[1] - filter2.shape[1]]) ]) hiddenW = numpy.vstack([ hiddenW, numpy.zeros( [filter0.shape[0] - hiddenW.shape[0], hiddenW.shape[1]]) ]) hiddenW = numpy.hstack([ hiddenW, numpy.zeros( [hiddenW.shape[0], filter0.shape[1] - hiddenW.shape[1]]) ]) logRWT = numpy.vstack([ logRWT, numpy.zeros( [filter0.shape[0] - logRWT.shape[0], logRWT.shape[1]]) ]) logRWT = numpy.hstack([ logRWT, numpy.zeros( [logRWT.shape[0], filter0.shape[1] - logRWT.shape[1]]) ]) logRWF = numpy.vstack([ logRWF, numpy.zeros( [filter0.shape[0] - logRWF.shape[0], logRWF.shape[1]]) ]) logRWF = numpy.hstack([ logRWF, numpy.zeros( [logRWF.shape[0], filter0.shape[1] - logRWF.shape[1]]) ]) totFilter = numpy.hstack( [filter0, filter1, filter2, hiddenW, logRWT, logRWF]) totlayer2 = numpy.hstack( [filter01, filter11, filter21, numpy.zeros([9, 3 * 9])]) totFilter = numpy.vstack([totFilter, totlayer2]) ''' print('preOutput1') print(numpy.mean(abs(numpy.array(cost_ij[13])))) print(numpy.mean(abs(numpy.array(cost_ij[14])))) print(numpy.mean(abs(numpy.array(cost_ij[15])))) print('preOutput3') print(numpy.mean(abs(numpy.array(cost_ij[18])))) print('postlayer0') print(numpy.mean(abs(numpy.array(cost_ij[16])))) print('postlayer2') print(numpy.mean(abs(numpy.array(cost_ij[17])))) print('preOutput4') print((numpy.array(cost_ij[19]))) print('layer4W,layer4B,layer4input') #print((numpy.array(cost_ij[20]))) #print((numpy.array(cost_ij[21]))) print((numpy.array(cost_ij[22]))) ''' #totFilter = numpy.array(cost_ij[8][0,0,...]) #print(filter0) #plt.imshow(filter0, cmap = cm.Greys_r,interpolation="nearest") #plt.show() filterHolder.append(totFilter) #=filter0 costHolder.append(numpy.mean(cost_ij[12])) if iter > 1: a = 1 ''' #print(filterHolder[int(iter-1)][0]) print('abs values') ##print(filter0_) ##print(filter1_) ##print(numpy.reshape(hiddenW_[0:81],[9,9])) print(numpy.reshape(logRWT_[0:81],[9,9])) print(numpy.array(cost_ij[9]).shape) print((numpy.array(cost_ij[9])[0])) print('end abs values') print('p_y_given_x') print(numpy.array(cost_ij[10])) print(numpy.array(cost_ij[11])) print(iter) #print(len(filterHolder)) ##print(filterHolder[int(iter)][0:9,0:9]-filterHolder[int(iter)-1][0:9,0:9]) ##print(filterHolder[int(iter)][0:9,9:18]-filterHolder[int(iter)-1][0:9,9:18]) #print(filterHolder[int(iter-1)][0:9,0:9].shape) #print(filterHolder[int(iter-1)][2].shape) #print(filterHolder[int(iter-1)][3].shape) ''' print('filterHolders') print( numpy.mean(filterHolder[int(iter)][0:9, 0:9] - 
filterHolder[int(iter) - 1][0:9, 0:9])) print( numpy.mean(filterHolder[int(iter)][0:9, 9:18] - filterHolder[int(iter) - 1][0:9, 9:18])) print( numpy.mean(filterHolder[int(iter)][0:9, 18:27] - filterHolder[int(iter) - 1][0:9, 18:27])) print( numpy.mean(filterHolder[int(iter)][0:9, 36:45] - filterHolder[int(iter) - 1][0:9, 36:45])) print('cost') print(numpy.array(cost_ij[0])) print('p_y_given_x') print(numpy.array(cost_ij[10])) print((numpy.array(cost_ij[11]))) counter4 += 1 ''' print('layer4.Wb') print(layer4.W.get_value()) print(layer4.b.get_value()) print('layer3.Wb') print(layer3.W.get_value()) print(layer3.b.get_value()) print('layer2.Wb') print(layer2.W.get_value()) print(layer2.b.get_value()) print('layer1.Wb') print(layer1.W.get_value()) print(layer1.b.get_value()) print('layer0.Wb') print(layer0.W.get_value()) print(layer0.b.get_value()) ''' #print(layer4.input.eval()) #print(layer4.p_y_given_x.eval({'input':layer4.input,'SelfW':layer4.W,'SelfB':layer4.b})) #x_printed = theano.printing.Print('this is a very important value')(x) #f = theano.function([x], x * 5) #f_with_print = theano.function([x], x_printed * 5) #assert numpy.all( f_with_print([1, 2, 3]) == [5, 10, 15]) if (iter + 1) % validation_frequency == 0: if (counter3) % 5 == 0: a = 1 ''' filter01_ = numpy.array(cost_ij[3])[2,0,...] filter01 = filter01_/(filter01_.max()) filterHolder.append(filter01) totFilter = numpy.array(cost_ij[8][0,2,...]) filterHolder.append(totFilter) totFilter = numpy.array(cost_ij[8][1,2,...]) filterHolder.append(totFilter) totFilter = numpy.array(cost_ij[8][2,2,...]) filterHolder.append(totFilter) totFilter = numpy.array(cost_ij[8][3,2,...]) filterHolder.append(totFilter) totFilter = numpy.array(cost_ij[8][4,2,...]) filterHolder.append(totFilter) ''' #totFilter = numpy.array(cost_ij[8][5,0,...]) #filterHolder.append(totFilter) counter3 += 1 # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(int(n_valid_batches)) ] this_validation_loss = numpy.mean(validation_losses) validHolder.append(this_validation_loss) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter best_params = params0 + params1 # test it on the test set test_losses = [ test_model(i) for i in range(int(n_test_batches)) ] test_score = numpy.mean((test_losses)) print(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) with open(os.path.join(basePath, 'best_modelWEB.pkl'), 'wb') as f: pickle.dump([ numpy.array(cost_ij[7]), numpy.array(cost_ij[28]), numpy.array(cost_ij[6]), numpy.array(cost_ij[27]), numpy.array(cost_ij[5]), numpy.array(cost_ij[26]), numpy.array(cost_ij[4]), numpy.array(cost_ij[25]), numpy.array(cost_ij[3]), numpy.array(cost_ij[24]), validHolder, trainHolder ], f) if iter > 150000: break if patience <= iter: a = 1 #done_looping = True #break trainHolder.append(sum(costHolder) / len(costHolder)) print('TrainHolder : ') print(trainHolder) with open(os.path.join(basePath, 'final_modelWEB.pkl'), 'wb') as f: pickle.dump([ 
numpy.array(cost_ij[7]), numpy.array(cost_ij[28]), numpy.array(cost_ij[6]), numpy.array(cost_ij[27]), numpy.array(cost_ij[5]), numpy.array(cost_ij[26]), numpy.array(cost_ij[4]), numpy.array(cost_ij[25]), numpy.array(cost_ij[3]), numpy.array(cost_ij[24]), validHolder, trainHolder ], f) end_time = timeit.default_timer() print('Optimization complete.') print((params0 + params1)) print('Valid Holder') print(validHolder) print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) fig = plt.figure() # make figure im = plt.imshow(filterHolder[0], cmap=cm.Greys_r, interpolation="nearest") def updatefig(j): # set the data in the axesimage object im.set_array(filterHolder[j]) print(j) #print(filterHolder[j]) # return the artists set return im, ani = animation.FuncAnimation(fig, updatefig, frames=len(filterHolder), interval=10, blit=True, repeat=True) #plt.imshow(filterHolder[0], cmap = cm.Greys_r, interpolation="nearest") #print(filterHolder) plt.show()
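# The hard-coded image_shape sizes above (200 -> 96 -> 44 -> 18) come from a valid
# convolution followed by non-overlapping max-pooling at every stage. A minimal
# sketch of that arithmetic, assuming square inputs, filters and pools; the helper
# name conv_pool_out_size is illustrative and not part of the original code.
def conv_pool_out_size(in_size, filter_size, pool_size):
    """Side length after a valid convolution and a max-pooling step."""
    conv_out = in_size - filter_size + 1
    return conv_out // pool_size

size = 200  # finalSize used above
for filt, pool in [(9, 2), (9, 2), (9, 2)]:
    size = conv_pool_out_size(size, filt, pool)
    print(size)  # prints 96, 44, 18 -- matching the image_shape arguments above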
class MLPRanker(object): def __init__(self, verbose=True): if verbose: logger.debug('Build Multilayer Perceptron Ranking model...') # Positive input setting self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative input setting self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Standard input setting self.inputL = T.matrix(name='inputL', dtype=floatX) self.inputR = T.matrix(name='inputR', dtype=floatX) # Build activation function self.act = Activation('tanh') # Connect input matrices self.inputP = T.concatenate([self.inputPL, self.inputPR], axis=1) self.inputN = T.concatenate([self.inputNL, self.inputNR], axis=1) self.input = T.concatenate([self.inputL, self.inputR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer(self.input, (2*edim, args.hidden), act=self.act) self.hidden = self.hidden_layer.output self.hiddenP = self.hidden_layer.encode(self.inputP) self.hiddenN = self.hidden_layer.encode(self.inputN) # Dropout parameter - test self.thidden = (1-args.dropout) * self.hidden self.thiddenP = (1-args.dropout) * self.hiddenP self.thiddenN = (1-args.dropout) * self.hiddenN # Dropout parameter - train srng = T.shared_randomstreams.RandomStreams(args.seed) mask = srng.binomial(n=1, p=1-args.dropout, size=self.hidden.shape) maskP = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenP.shape) maskN = srng.binomial(n=1, p=1-args.dropout, size=self.hiddenN.shape) self.hidden *= T.cast(mask, floatX) self.hiddenP *= T.cast(maskP, floatX) self.hiddenN *= T.cast(maskN, floatX) # Build linear output layer self.score_layer = ScoreLayer(self.hidden, args.hidden) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.hiddenP) self.scoreN = self.score_layer.encode(self.hiddenN) # Build for test self.toutput = self.score_layer.encode(self.thidden) self.tscoreP = self.score_layer.encode(self.thiddenP) self.tscoreN = self.score_layer.encode(self.thiddenN) # Stack all the parameters self.params = [] self.params += self.hidden_layer.params self.params += self.score_layer.params # Build cost function self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP), 1.0-self.scoreP+self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Count the total number of parameters in this model self.num_params = edim * args.hidden + args.hidden + args.hidden + 1 # Build class method self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.toutput) self.compute_cost_and_gradient = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams+[self.cost, self.scoreP, self.scoreN]) self.show_scores = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.tscoreP, self.tscoreN]) if verbose: logger.debug('Architecture of MLP Ranker built finished, summarized below: ') logger.debug('Input dimension: %d' % edim) logger.debug('Hidden dimension: %d' % args.hidden) logger.debug('Total number of parameters used in the model: %d' % self.num_params) def update_params(self, grads, learn_rate): for param, grad in zip(self.params, grads): p = param.get_value(borrow=True) param.set_value(p - learn_rate * grad, borrow=True) @staticmethod def save(fname, model): with file(fname, 'wb') as fout: cPickle.dump(model, fout) @staticmethod def load(fname): with file(fname, 'rb') as fin: model = 
cPickle.load(fin) return model
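# The ranker above applies a binomial dropout mask to the hidden units at training
# time and rescales the same units by (1 - dropout) at scoring time (the t-prefixed
# graph). A minimal numpy sketch of why those two passes agree in expectation;
# the array names are illustrative only.
import numpy as np

rng = np.random.RandomState(1234)
dropout = 0.5
hidden = rng.randn(4, 8)  # a small batch of hidden activations

# training-time pass: drop each unit with probability `dropout`
mask = rng.binomial(n=1, p=1.0 - dropout, size=hidden.shape)
hidden_train = hidden * mask

# test-time pass: keep every unit but scale by the expected keep probability
hidden_test = hidden * (1.0 - dropout)

# averaging many independent training-time masks recovers the test-time scaling
avg = np.mean([hidden * rng.binomial(1, 1.0 - dropout, hidden.shape)
               for _ in range(10000)], axis=0)
print(np.allclose(avg, hidden_test, atol=0.1))  # True (up to sampling noise)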
def evaluate_transfer_lenet5( learning_rate=0.1, alpha=1, n_epochs=20, source_dataset='../data/resize_mnist_whiten.pkl.gz', target_dataset='../data/usps_whiten.pkl.gz', training_dataset='../data/shuffled_training_data_big_new.pkl.gz', nkerns=[20, 50], batch_size=4000): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_source_data(source_dataset) target_datasets = load_target_data(target_dataset) transfer_training_datasets = load_transfer_training_data(training_dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] target_set_x, target_set_y = target_datasets transfer_training_set_x, transfer_training_set_y = transfer_training_datasets # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_target_batches = target_set_x.get_value(borrow=True).shape[0] n_transfer_training_batches = transfer_training_set_x.get_value( borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size n_target_batches /= batch_size n_transfer_training_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels y_in = T.ivector('y_in') ishape = (16, 16) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 16, 16)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 16, 16), filter_shape=(nkerns[0], 1, 3, 3), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 7, 7), filter_shape=(nkerns[1], nkerns[0], 3, 3), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (batch_size, nkerns[1] * 2 * 2) = (4000, 200) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 2 * 2, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) x_prob = layer3.py_given_x() # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params marginal_params = layer1.params + layer0.params # calculate marginal MMD and its gradients # todo: # q1: why is the outermost operation a mean rather than an L2 norm? # q2: the paper suggests layer l-1 should be layer2, yet the code uses layer1 marginal_MMD = T.mean( T.mean(layer2_input[T.arange(batch_size / 2)]) - T.mean(layer2_input[T.arange(batch_size / 2, batch_size, 1)])) marginal_grads = T.grad(T.dot(marginal_MMD, marginal_MMD), marginal_params) # the cost we minimize during training is the NLL of the model lost_cost = layer3.negative_log_likelihood(y) # calculate conditional MMD conditional_cost_all = T.mean(x_prob[0:batch_size/2:1,0:10:1],axis = 0)\ - T.mean(x_prob[batch_size/2:batch_size:1,0:10:1],axis = 0) conditional_cost = T.dot(conditional_cost_all, conditional_cost_all) # add the classification loss and the conditional MMD together cost = lost_cost + 100 * conditional_cost # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) target_model = theano.function( [index], layer3.errors(y), givens={ x: target_set_x[index * batch_size:(index + 1) * batch_size], y: target_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. # todo: why are marginal_MMD and the main cost kept as separate losses, with their gradients also applied as separate updates? marginal_updates = [] for param_i, marginal_grad_i in zip(marginal_params, marginal_grads): marginal_updates.append( (param_i, param_i - 100 * learning_rate * marginal_grad_i)) updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # todo: where is the code that replaces the target-domain labels with the latest predictions? 
transfer_model = theano.function( [index], [cost, conditional_cost], updates=updates, givens={ x: transfer_training_set_x[index * batch_size:(index + 1) * batch_size], y: transfer_training_set_y[index * batch_size:(index + 1) * batch_size] }) marginal_model = theano.function( [index], marginal_MMD, updates=marginal_updates, givens={ x: transfer_training_set_x[index * batch_size:(index + 1) * batch_size] }) y_out = layer3.get_output() target_predict = theano.function( [index], y_out, givens={x: target_set_x[index * batch_size:(index + 1) * batch_size]}) update_target_training_label = theano.function( [index], y_out, #updates=label_updates, givens={ x: transfer_training_set_x[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant batch_number = n_transfer_training_batches #n_train_batches validation_frequency = min(batch_number, patience / 2) update_frequency = min(batch_number, patience / 2) * 10 # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(batch_number): iter = (epoch - 1) * batch_number + minibatch_index cost = transfer_model(minibatch_index) MMD_margin = marginal_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, batch_number, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of the ' 'model %f %%') % (epoch, minibatch_index + 1, batch_number, test_score * 100.)) target_losses = [ target_model(i) for i in xrange(n_target_batches) ] target_score = numpy.mean(target_losses) print( (' epoch %i, minibatch %i/%i, target error of the ' 'model %f %%') % (epoch, minibatch_index + 1, n_transfer_training_batches, target_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') #print('Best validation score of %f %% obtained at iteration %i,'\ # 'with test performance %f %%' % # (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) noisy_y = target_set_y.eval() for i in xrange(n_target_batches): noisy_y[i * batch_size:(i + 1) * batch_size] = target_predict(i) resizeValidSet = 
[target_set_x, noisy_y] fw = gzip.open("../data/predict_data_CNN_new.pkl.gz", 'wb') cPickle.dump(resizeValidSet, fw) fw.close()
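# The transfer objective above uses a heavily simplified MMD: each minibatch is
# assumed to hold source examples in its first half and target examples in its
# second half, and the penalties compare feature means (marginal) and predicted
# class-probability means (conditional) across the two halves. A numpy sketch of
# both terms under that batch-layout assumption; all names are illustrative.
import numpy as np

rng = np.random.RandomState(0)
batch_size, n_feat, n_class = 8, 16, 10
features = rng.randn(batch_size, n_feat)                  # analogue of layer2_input
probs = rng.dirichlet(np.ones(n_class), size=batch_size)  # analogue of p(y|x)

half = batch_size // 2
marginal_mmd = features[:half].mean() - features[half:].mean()    # marginal_MMD
marginal_penalty = marginal_mmd ** 2                              # squared via T.dot(m, m)

cond_gap = probs[:half].mean(axis=0) - probs[half:].mean(axis=0)  # conditional_cost_all
conditional_penalty = cond_gap @ cond_gap                         # conditional_cost
print(marginal_penalty, conditional_penalty)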
def evaluate_lenet5(datasets, learning_seed=0.01, n_epochs=500, batch_size=250, save_folder='./cache', channel_count=1): """ Evaluate a convnet for three dimensional image inputs. :type learning_seed: float :param learning_seed: learning rate used (factor for the stochastic gradient) during initialization. :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type batch_size: integer :param batch_size: size for batched testing :type channel_count: integer :param channel_count: number of channels per image """ rng = numpy.random.RandomState(23455) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch new_rate = T.lscalar() # The learning rate. # start-snippet-1 r = T.dscalar('r') # the learning rate as a variable. x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape # (batch_size, channel_count, 32 * 32) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (32, 32) is the size of CIFAR images. layer0_input = x.reshape((batch_size, channel_count, 32, 32)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (32+2-3+1 , 32+2-3+1) = (32, 32) # maxpooling reduces this further to (32/2, 32/2) = (16, 16) # 4D output tensor is thus of shape (batch_size, nkerns[0], 16, 16) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, channel_count, 32, 32), filter_shape=(128, channel_count, 3, 3), poolsize=(1, 1) ) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, 128, 32, 32), filter_shape=(128, 128, 3, 3), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (16+2-3+1, 16+2-3+1) = (16, 16) # maxpooling reduces this further to (16/2, 16/2) = (8, 8) # 4D output tensor is thus of shape (batch_size, nkerns[1], 8, 8) layer2 = LeNetConvPoolLayer( rng, input=layer1.output, image_shape=(batch_size, 128, 16, 16), filter_shape=(256, 128, 3, 3), poolsize=(1, 1) ) layer3 = LeNetConvPoolLayer( rng, input=layer2.output, image_shape=(batch_size, 256, 16, 16), filter_shape=(256, 256, 3, 3), poolsize=(1, 1) ) layer4 = LeNetConvPoolLayer( rng, input=layer3.output, image_shape=(batch_size, 256, 16, 16), filter_shape=(256, 256, 3, 3), poolsize=(1, 1) ) layer5 = LeNetConvPoolLayer( rng, input=layer4.output, image_shape=(batch_size, 256, 16, 16), filter_shape=(256, 256, 3, 3), poolsize=(2, 2) ) # Construct the third convolutional pooling layer # filtering reduces the image size to (8+2-3+1, 8+2-3+1) = (8, 8) # No maxpooling (aka maxpooling (1, 1)) # 4D output tensor is thus of shape (batch_size, nkerns[1], 8, 8) layer6 = LeNetConvPoolLayer( rng, input=layer5.output, image_shape=(batch_size, 256, 8, 8), filter_shape=(512, 256, 3, 3), 
poolsize=(1, 1) ) layer7 = LeNetConvPoolLayer( rng, input=layer6.output, image_shape=(batch_size, 512, 8, 8), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) layer8 = LeNetConvPoolLayer( rng, input=layer7.output, image_shape=(batch_size, 512, 8, 8), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) layer9 = LeNetConvPoolLayer( rng, input=layer8.output, image_shape=(batch_size, 512, 8, 8), filter_shape=(512, 512, 3, 3), poolsize=(2, 2) ) # Construct the third convolutional pooling layer # filtering reduces the image size to (8+2-3+1, 8+2-3+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer10 = LeNetConvPoolLayer( rng, input=layer9.output, image_shape=(batch_size, 512, 4, 4), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) layer11 = LeNetConvPoolLayer( rng, input=layer10.output, image_shape=(batch_size, 512, 4, 4), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) layer12 = LeNetConvPoolLayer( rng, input=layer11.output, image_shape=(batch_size, 512, 4, 4), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) layer13 = LeNetConvPoolLayer( rng, input=layer12.output, image_shape=(batch_size, 512, 4, 4), filter_shape=(512, 512, 3, 3), poolsize=(1, 1) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (500, 512 * 4 * 4) = (500, 8192) with the default values. layer14_input = layer13.output.flatten(2) # construct a fully-connected sigmoidal layer layer14 = HiddenLayer( rng, input=layer14_input, n_in=512 * 4 * 4, n_out=2048, activation=relu ) layer15 = HiddenLayer( rng, input=layer14.output, n_in=2048, n_out=1024, activation=relu ) # classify the values of the fully-connected sigmoidal layer # there are 10 labels in total. layer16 = HiddenLayer( rng, input=layer15.output, n_in=1024, n_out=10, activation=relu ) # the cost we minimize during training is the NLL of the model cost = layer16.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer16.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer16.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = layer16.params + \ layer15.params + \ layer14.params + \ layer13.params + \ layer12.params + \ layer11.params + \ layer10.params + \ layer9.params + \ layer8.params + \ layer7.params + \ layer6.params + \ layer6.params + \ layer5.params + \ layer4.params + \ layer3.params + \ layer2.params + \ layer1.params + \ layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. 
updates = [ (param_i, param_i - r * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index, new_rate], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size], r: new_rate } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 cur_learning_rate = learning_seed test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index, cur_learning_rate) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) else: # Did not get a new best validation score. cur_learning_rate /= 10 if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
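# Unlike the other training loops in this file, the one above feeds the learning
# rate in as a symbolic input (r / new_rate) and divides it by 10 whenever a
# validation check fails to beat the best score so far. A plain-Python sketch of
# that schedule; the validation numbers are made up for illustration.
def decay_on_plateau(val_losses, lr=0.01, factor=10.0):
    best = float('inf')
    schedule = []
    for loss in val_losses:
        if loss < best:
            best = loss   # new best validation score: keep the current rate
        else:
            lr /= factor  # no improvement: shrink the step size
        schedule.append(lr)
    return schedule

print(decay_on_plateau([0.80, 0.60, 0.59, 0.61, 0.40]))
# -> [0.01, 0.01, 0.01, 0.001, 0.001]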
def __init__(self, numpy_rng, theano_rng=None, n_ins=100, hidden_layers_size=None, n_outs=1, L1_reg=0.00, L2_reg=0.0001): if hidden_layers_size is None: hidden_layers_size = [100, 100] self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_size) assert self.n_layers > 0 if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30)) self.x = T.matrix('x') self.y = T.vector('y') for i in range(self.n_layers): if i == 0: input_sizes = n_ins else: input_sizes = hidden_layers_size[i - 1] if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_sizes, n_out=hidden_layers_size[i], activation=T.nnet.sigmoid) self.sigmoid_layers.append(sigmoid_layer) self.params.extend(sigmoid_layer.params) rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_sizes, n_hidden=hidden_layers_size[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) self.linearRegressionLayer = LinearRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_size[-1], n_out=n_outs) self.L1 = abs(self.sigmoid_layers[-1].W).sum() + abs( self.linearRegressionLayer.W).sum() self.L2_sqr = (self.sigmoid_layers[-1].W** 2).sum() + (self.linearRegressionLayer.W**2).sum() self.squared_errors = self.linearRegressionLayer.squared_errors(self.y) self.finetune_cost = self.squared_errors + L1_reg * self.L1 + L2_reg * self.L2_sqr self.y_pred = self.linearRegressionLayer.p_y_given_x self.params = self.params + self.linearRegressionLayer.params
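# The finetuning cost above is the squared error plus L1/L2 penalties taken only
# from the top sigmoid layer and the linear regression layer. A numpy sketch of
# that cost for a single linear layer, assuming mean squared error; the weights
# and data here are synthetic and purely illustrative.
import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(5, 1) * 0.1
b = np.zeros(1)
X = rng.randn(20, 5)
y = X @ np.array([0.5, -1.0, 0.0, 2.0, 0.3])[:, None] + 0.01 * rng.randn(20, 1)

L1_reg, L2_reg = 0.00, 0.0001
pred = X @ W + b
squared_errors = ((pred - y) ** 2).mean()
L1 = np.abs(W).sum()      # analogue of self.L1
L2_sqr = (W ** 2).sum()   # analogue of self.L2_sqr
finetune_cost = squared_errors + L1_reg * L1 + L2_reg * L2_sqr
print(finetune_cost)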
def test_SdA(finetune_lr=0.1, pretraining_epochs=0, pretrain_lr=0.05, training_epochs=100, dataset='mnist.pkl.gz', batch_size=10): """ Demonstrates how to train and test a stochastic denoising autoencoder. This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used in the finetune stage (factor for the stochastic gradient) :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float :param pretrain_lr: learning rate to be used during pre-training :type n_iter: int :param n_iter: maximal number of iterations ot run the optimizer :type dataset: string :param dataset: path the the pickled dataset """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] train_set_x=train_set_x.eval() train_set_y=train_set_y.eval() train_set_x_lab=train_set_x[:1000,:] train_set_x_unlab=train_set_x[1000:,:] train_set_y_lab=train_set_y[:1000] train_set_y_unlab=train_set_y[1000:] import theano train_set_x_lab=theano.shared(numpy.asarray(train_set_x_lab, dtype=theano.config.floatX), borrow=True) train_set_y_lab=theano.shared(numpy.asarray(train_set_y_lab, dtype=theano.config.floatX), borrow=True) train_set_y_lab=T.cast(train_set_y_lab, 'int32') train_set_x_unlab=theano.shared(numpy.asarray(train_set_x_unlab, dtype=theano.config.floatX), borrow=True) train_set_y_unlab=theano.shared(numpy.asarray(train_set_y_unlab, dtype=theano.config.floatX), borrow=True) train_set_y_unlab=T.cast(train_set_y_unlab, 'int32') # compute number of minibatches for training, validation and testing n_train_batches = train_set_y_lab.eval().shape[0] n_train_batches /= batch_size n_train_batches_u = train_set_y_unlab.eval().shape[0] n_train_batches_u /= batch_size # numpy random generator # start-snippet-3 numpy_rng = numpy.random.RandomState(89677) print '... building the model' # construct the stacked denoising autoencoder class hidden_layer_size = 100 sda = SdA( numpy_rng=numpy_rng, n_ins=28 * 28, hidden_layers_sizes=[100], n_outs=10 ) # end-snippet-3 start-snippet-4 ######################### # PRETRAINING THE MODEL # ######################### print '... getting the pretraining functions' pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x_unlab, batch_size=batch_size) print '... pre-training the model' start_time = time.clock() ## Pre-train layer-wise corruption_levels = [0.1, 0.2, 0.3] for i in xrange(sda.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, corruption=corruption_levels[i],##$ lr=pretrain_lr)) print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = time.clock() print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) fprop = theano.function( [], sda.output, givens={ sda.x: test_set_x }, name='fp' ) Q=fprop() print 'rec', ((Q-test_set_x.eval())**2).mean() from utils import tile_raster_images,plot_weights import PIL.Image as Image image = Image.fromarray( tile_raster_images(X=sda.dA_layers[0].W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters_corruption_0.png') # end-snippet-4 ######################## # FINETUNING THE MODEL FOR REGRESSION # if 0 : # pretrain middle layer print '... 
pre-training MIDDLE layer' h1 = T.matrix('x') # the data is presented as rasterized images h2 = T.matrix('y') # the labels are presented as 1D vector of log_reg = HiddenLayer(numpy_rng, h1, hidden_layer_size, hidden_layer_size) if 1: # for middle layer learning_rate = 0.05 fprop_inp = theano.function( [], SdA_inp.sigmoid_layers[-1].output, givens={ SdA_inp.sigmoid_layers[0].input: train_set_x }, name='fprop_inp' ) fprop_out = theano.function( [], SdA_out.sigmoid_layers[-1].output, givens={ SdA_out.sigmoid_layers[0].input: train_set_y }, name='fprop_out' ) H11=fprop_inp() H21=fprop_out() H1=N1.predict(train_set_x.eval()) H2=N2.predict(train_set_y.eval()) H1=theano.shared(H1) H2=theano.shared(H2) # compute the gradients with respect to the model parameters logreg_cost = log_reg.mse(h2) gparams = T.grad(logreg_cost, log_reg.params) # compute list of fine-tuning updates updates = [ (param, param - gparam * learning_rate) for param, gparam in zip(log_reg.params, gparams) ] train_fn_middle = theano.function( inputs=[], outputs=logreg_cost, updates=updates, givens={ h1: H1, h2: H2 }, name='train_middle' ) epoch = 0 while epoch < 10: print epoch, train_fn_middle() epoch += 1 from mlp import MLP net = MLP(numpy_rng, train_set_x_lab, 28*14, hidden_layer_size, 28*14, W1=sda.dA_layers[0].W, b1=sda.dA_layers[0].b, W2=None, b2=None) ######################## ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, validate_model, test_model = sda.build_finetune_functions( datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr ) print '... finetunning the model' # early-stopping parameters patience = 10 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if ( this_validation_loss < best_validation_loss * improvement_threshold ): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = test_model() test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print( ( 'Optimization complete with best validation score of %f %%, ' 'on iteration %i, ' 'with test performance %f %%' ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.) ) print >> sys.stderr, ('The training code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
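# Every training loop in this file repeats the same patience-based early stopping.
# A condensed plain-Python sketch of just that bookkeeping, with one validation
# loss per epoch; the numbers and the helper name are illustrative only.
def early_stopping_demo(val_curve, n_batches=100, patience=500,
                        patience_increase=2, improvement_threshold=0.995):
    best, best_iter = float('inf'), -1
    for epoch, loss in enumerate(val_curve, start=1):
        it = epoch * n_batches - 1                    # iteration of the epoch's last batch
        if loss < best:
            if loss < best * improvement_threshold:   # significant improvement
                patience = max(patience, it * patience_increase)
            best, best_iter = loss, it
        if patience <= it:
            break                                     # patience exhausted: stop training
    return best, best_iter, patience

print(early_stopping_demo([0.9, 0.5, 0.499, 0.498, 0.6, 0.61, 0.62]))
# -> (0.498, 399, 500): training stops once the iteration count passes patience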
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :learning_rate: learning rate for gradient descent :n_epochs: maximum number of training epochs :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :nkerns: number of convolution kernels in each convolutional layer; the first layer uses nkerns[0]=20 kernels and the second layer uses 50 """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) # load the data; it contains three splits train_set_x, train_set_y = datasets[0] # training set valid_set_x, valid_set_y = datasets[1] # validation set test_set_x, test_set_y = datasets[2] # test set # compute how many minibatches the data can be split into for training n_train_batches = train_set_x.get_value(borrow=True).shape[0] # number of training examples n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size # number of minibatches n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 28, 28)) '''Build the first layer of the network: image_shape: the input is a 28*28 feature map, with batch_size training examples and 1 feature map per example. filter_shape: nkerns[0]=20 kernels, so this layer produces 20 feature maps per training example. Convolution reduces the image size to (28-5+1, 28-5+1) = (24, 24). Pooling reduces it further to (24/2, 24/2) = (12, 12). The resulting image_shape of this layer is (batch_size, nkerns[0], 12, 12)''' layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) '''Build the second layer of the network: the input is batch_size training images; after the first layer each image has nkerns[0] feature maps of size 12*12. Convolution reduces the image size to (12-5+1, 12-5+1) = (8, 8). Pooling reduces it further to (8/2, 8/2) = (4, 4). The resulting image_shape of this layer is (batch_size, nkerns[1], 4, 4)''' layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) '''Fully connected layer: layer2_input is a 2D matrix whose first dimension indexes the samples and whose second dimension holds the units each sample obtains after the convolution and subsampling above, i.e. the features of each sample. The HiddenLayer class is a single-layer network; layer2 below compresses the 800 units down to 500''' layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # final layer: a logistic regression classifier that maps the 500 units down to 10 units, one for each handwritten digit 0-9 layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # put all parameters in a single list; the per-layer parameter lists can simply be concatenated params = layer3.params + layer2.params + layer1.params + layer0.params # gradients of the cost with respect to the parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): # train on one minibatch of data cost_ij = train_model(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.), file=sys.stderr)
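# train_model above relies on two conventions repeated throughout this file:
# (1) a minibatch is addressed by an integer index that slices the shared dataset
#     as [index * batch_size:(index + 1) * batch_size], and
# (2) the updates are (param, param - learning_rate * grad) pairs.
# A numpy sketch of both ideas with a linear least-squares model, so the gradient
# can be written by hand; all names and numbers are illustrative.
import numpy as np

rng = np.random.RandomState(23455)
X = rng.randn(500, 3)
true_w = np.array([1.5, -2.0, 0.5])
y = X @ true_w + 0.01 * rng.randn(500)

W = np.zeros(3)
learning_rate, batch_size = 0.1, 100
n_train_batches = X.shape[0] // batch_size   # integer division, as the loops assume

for epoch in range(20):
    for index in range(n_train_batches):
        xb = X[index * batch_size:(index + 1) * batch_size]
        yb = y[index * batch_size:(index + 1) * batch_size]
        grad = 2.0 * xb.T @ (xb @ W - yb) / batch_size  # gradient of the mean squared error
        W = W - learning_rate * grad                    # the SGD update pair
print(W)  # close to [1.5, -2.0, 0.5]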
def __init__(self, config=None, verbose=True): # Construct two GrCNNEncoders for matching two sentences self.encoderL = ExtGrCNNEncoder(config, verbose) self.encoderR = ExtGrCNNEncoder(config, verbose) # Link the parameters of two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Build three kinds of inputs: # 1, inputL, inputR. This pair is used for computing the score after training # 2, inputPL, inputPR. This part is used for training positive pairs # 3, inputNL, inputNR. This part is used for training negative pairs self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Positive self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Linking input-output mapping self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Positive self.hiddenPL = self.encoderL.encode(self.inputPL) self.hiddenPR = self.encoderR.encode(self.inputPR) # Negative self.hiddenNL = self.encoderL.encode(self.inputNL) self.hiddenNR = self.encoderR.encode(self.inputNR) # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1) self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=1) self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer(self.hidden, (2*config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP) self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN) # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hidden.shape) maskP = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenP.shape) maskN = srng.binomial(n=1, p=1-config.dropout, size=self.compressed_hiddenN.shape) self.compressed_hidden *= T.cast(mask, floatX) self.compressed_hiddenP *= T.cast(maskP, floatX) self.compressed_hiddenN *= T.cast(maskN, floatX) # Score layers self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.compressed_hiddenP) self.scoreN = self.score_layer.encode(self.compressed_hiddenN) # Accumulate parameters self.params += self.score_layer.params # Build cost function self.cost = T.mean(T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Compute the total number of parameters in the model self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build class methods self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) self.compute_cost_and_gradient = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams+[self.cost, self.scoreP, self.scoreN]) self.show_scores = 
theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.scoreP, self.scoreN]) self.show_hiddens = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.hiddenP, self.hiddenN]) self.show_inputs = theano.function(inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR]) if verbose: logger.debug('Architecture of ExtGrCNNMatchScorer built finished, summarized below: ') logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension inside GrCNNMatchScorer pyramid: %d' % config.num_hidden) logger.debug('Hidden dimension MLP: %d' % config.num_mlp) logger.debug('Number of Gating functions: %d' % config.num_gates) logger.debug('There are 2 ExtGrCNNEncoders used in model.') logger.debug('Total number of parameters used in the model: %d' % self.num_params)
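# Both ranking models in this file train with the same pairwise hinge loss,
# T.mean(T.maximum(0, 1 - scoreP + scoreN)): a positive pair should outscore its
# negative pair by a margin of at least 1. A numpy sketch of that cost; the
# scores below are made up for illustration.
import numpy as np

def ranking_hinge(score_pos, score_neg, margin=1.0):
    return np.maximum(0.0, margin - score_pos + score_neg).mean()

score_pos = np.array([ 2.0, 0.3, -0.5])
score_neg = np.array([-1.0, 0.1,  0.4])
print(ranking_hinge(score_pos, score_neg))  # (0 + 0.8 + 1.9) / 3 = 0.9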
def cnn(pre_run,kind, PV, true_out ,learning_rate=0.1, n_epochs=200, datasets='mnist.pkl.gz',batch_size=100, path="", name="",input_layer={}, hidden={},ConvPool={},out_layer={}): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] array_PV = PV PV = theano.shared(value=0.5*numpy.ones(PV.shape,dtype="float32"),borrow=True) true_out = theano.shared(value=true_out,borrow=True) assert PV.get_value().shape[0] == train_set_x.get_value().shape[0] z1 = T.matrix('z1') z2 = T.matrix('z2') # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, int(input_layer['channel']), int(input_layer['width']), int(input_layer['height']))) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) # # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) # # CP is a list storing the ConvPool layer. 
CP = [] for i in xrange(len(ConvPool)): tem = 'ConvPool'+ str(i) if i == 0: activation = None if int(ConvPool[tem]['activation']) == 1: activation = T.nnet.sigmoid if int(ConvPool[tem]['activation']) == 2: activation = T.tanh CP.append(LeNetConvPoolLayer( rng, activation = activation, input=layer0_input, image_shape=( batch_size, int(ConvPool[tem]['channel']), int(ConvPool[tem]['width']), int(ConvPool[tem]['height'])), filter_shape=(int(ConvPool[tem]['filters']), int(ConvPool[tem]['channel']), int(ConvPool[tem]['filter_width']), int(ConvPool[tem]['filter_height'])), poolsize=( int(ConvPool[tem]['pool_width']), int(ConvPool[tem]['pool_height'])))) if i != 0: activation = None if int(ConvPool[tem]['activation']) == 1: activation = T.nnet.sigmoid if int(ConvPool[tem]['activation']) == 2: activation = T.tanh CP.append(LeNetConvPoolLayer( rng, activation = activation, input=CP[-1].output, image_shape=( batch_size, int(ConvPool[tem]['channel']), int(ConvPool[tem]['width']), int(ConvPool[tem]['height'])), filter_shape=(int(ConvPool[tem]['filters']), int(ConvPool[tem]['channel']), int(ConvPool[tem]['filter_width']), int(ConvPool[tem]['filter_height'])), poolsize=( int(ConvPool[tem]['pool_width']), int(ConvPool[tem]['pool_height'])))) ConvPool_output = CP[-1].output.flatten(2) # construct a fully-connected sigmoidal layer # HL is a list storing the Hidden layer. HL = [] for i in xrange(len(hidden)): ite = len(ConvPool) + i tem = 'hidden_layer_'+ str(ite) if ite == len(ConvPool): activation = None if int(hidden[tem]['activation']) == 1: activation = T.nnet.sigmoid if int(hidden[tem]['activation']) == 2: activation = T.tanh HL.append( HiddenLayer(rng, input=ConvPool_output, n_in =int(hidden[tem]['n_in']), n_out=int(hidden[tem]['n_out']), activation=activation)) if ite != len(ConvPool): activation = None if int(hidden[tem]['activation']) == 1: activation = T.nnet.sigmoid if int(hidden[tem]['activation']) == 2: activation = T.tanh HL.append( HiddenLayer(rng, input=HL[-1].output, n_in =int(hidden[tem]['n_in']), n_out=int(hidden[tem]['n_out']), activation=activation)) hidden_output = HL[-1].output # classify the values of the fully-connected output layer OutLayer = HiddenLayer(rng=rng, \ input=hidden_output, \ n_in=int(out_layer['n_in']), \ n_out=int(out_layer['n_out']), \ activation=T.nnet.sigmoid, \ kind=2) # the cost we minimize during training is the NLL of the model cost = OutLayer.sq_loss(z1,z2) y_x = OutLayer.output # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], [OutLayer.errors(y),y_x], givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], OutLayer.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = OutLayer.params tem = len(HL) for i in xrange(len(HL)): params += HL[tem-1].params tem = tem -1 tem = len(CP) for i in xrange(len(CP)): params += CP[tem-1].params tem = tem -1 # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. 
updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], [ cost,y_x], updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], z1: PV[index * batch_size: (index + 1) * batch_size], z2: true_out[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False Hpy_out = [] while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 if epoch == pre_run: PV.set_value(array_PV) for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter tem = train_model(minibatch_index) cost_ij = tem[0] if epoch == n_epochs: Hpy_out.append(tem[1]) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i)[0] for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) #if patience <= iter: # done_looping = True # break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) # save the test result and return the train Hpy after train finished. test_out = [] for minibatch_index in xrange(n_test_batches): tem = test_model(minibatch_index) test_out.append(tem[1]) test_tem = numpy.asarray(test_out).reshape((n_test_batches * batch_size, \ int(out_layer['n_out']))) cPickle.dump(test_tem,open("./config-example/test_tem/"+name+".pkl","wb")) return Hpy_out
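# The cnn() driver above is configured entirely through nested dicts whose values
# it converts with int(...). A sketch of one configuration that matches the keys
# the function reads, for 28x28 single-channel images; the concrete numbers are
# illustrative assumptions derived with the usual valid-conv / max-pool size
# arithmetic, not values taken from the original experiments.
input_layer = {'channel': 1, 'width': 28, 'height': 28}
ConvPool = {
    # 28x28 -> conv 5x5 -> 24x24 -> pool 2x2 -> 12x12
    'ConvPool0': {'activation': 2, 'channel': 1, 'width': 28, 'height': 28,
                  'filters': 20, 'filter_width': 5, 'filter_height': 5,
                  'pool_width': 2, 'pool_height': 2},
    # 12x12 -> conv 5x5 -> 8x8 -> pool 2x2 -> 4x4
    'ConvPool1': {'activation': 2, 'channel': 20, 'width': 12, 'height': 12,
                  'filters': 50, 'filter_width': 5, 'filter_height': 5,
                  'pool_width': 2, 'pool_height': 2},
}
# hidden layers are keyed by their absolute position after the ConvPool layers
hidden = {
    'hidden_layer_2': {'activation': 2, 'n_in': 50 * 4 * 4, 'n_out': 500},
}
out_layer = {'n_in': 500, 'n_out': 10}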