def eval(input_x, input_y, test_x, test, label, write_folder=None,
         summary_ratio=0.3):
    """Fit a grid-searched SVR, score every test sentence and evaluate the
    resulting extract.

    NOTE: this function shadows the builtin ``eval``; the name is kept
    because it is the block's public interface.

    Args:
        input_x: training features handed to ``GridSearchCV.fit``.
        input_y: training targets handed to ``GridSearchCV.fit``.
        test_x: features to predict on. Predictions are consumed one per
            sentence, in the order sentences are visited when walking
            ``test`` -> ``.thread`` -> ``.sentences``, so ``test_x`` must be
            laid out in that same order.
        test: iterable of items exposing ``.thread`` (iterable of objects
            with ``.sentences``); each sentence exposes ``.index`` and
            receives a ``predict_score`` attribute as a side effect.
        label: not used by this function.
        write_folder: forwarded unchanged to ``weightRecall``.
        summary_ratio: fraction of each item's sentences to select
            (default 0.3, the value that used to be hard-coded).

    Returns:
        Tuple ``(score, rouge_score)`` where ``score`` is whatever
        ``weightRecall`` returns and ``rouge_score`` is the
        ``'rouge_l_f_score'`` entry of ``rouge(...).eval()``.
    """
    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    ]
    grid_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters)
    grid_clf.fit(input_x, input_y)

    print("params : \t")
    print(grid_clf.get_params())
    # get_params() only echoes the search configuration; the combination
    # actually selected by the search lives in best_params_.
    print("best params : \t")
    print(grid_clf.best_params_)

    result = grid_clf.predict(test_x)

    score_index = 0  # running position in the flat prediction vector
    produce_set = []
    for item in test:
        score_list = []
        index_list = []
        for thread in item.thread:
            for sentence in thread.sentences:
                sentence.predict_score = result[score_index]
                score_index += 1
                score_list.append(sentence.predict_score)
                index_list.append(sentence.index)

        # Presumably an ascending argsort of the scores, so reading it from
        # the end yields the highest-scored sentences first -- confirm
        # against sorted_index.
        order = sorted_index(score_list)
        total = len(index_list)
        limit = float(total) * summary_ratio

        selected = []
        for rank in range(total):
            if rank >= limit:
                break
            selected.append(index_list[order[total - rank - 1]])
        produce_set.append(selected)

    score = weightRecall(test, produce_set, write_folder)
    print(score)

    rouge_eval = rouge(test, produce_set)
    rouge_score = rouge_eval.eval()['rouge_l_f_score']
    print(rouge_score)

    return score, rouge_score
def rnn_test(self, summary_ratio=0.3):
    """Score the sentences of ``self.test`` with ``self.rnn_model`` and
    evaluate the resulting extract.

    For every item in ``self.test`` the sentences of each thread are scored
    by the model, and the top-ranked share of the item's sentences is
    selected.

    Args:
        summary_ratio: fraction of each item's sentences to select
            (default 0.3, the value that used to be hard-coded).

    Returns:
        Tuple ``(score, rouge_score)`` where ``score`` is whatever
        ``weightRecall`` returns and ``rouge_score`` is the
        ``'rouge_l_f_score'`` entry of ``rouge(...).eval()``.
    """
    produce_set = []
    for item in self.test:
        score_list = []
        index_list = []
        for thread in item.thread:
            features = []
            sentence_ids = []
            for sentence in thread.sentences:
                features.append(sentence.feature)
                sentence_ids.append(sentence.index)

            # The thread's rows are fed to the model twice in a row and only
            # the outputs for the second copy are read back.
            # NOTE(review): presumably the first pass primes the recurrent
            # state -- confirm against rnn_model.prob.
            doubled = numpy.asarray(numpy.float32(features + features))
            softmax_array = self.rnn_model.prob(doubled)

            # Floor division keeps the slice bound an integer.
            second_half = softmax_array[len(softmax_array) // 2:]
            for position, sentence_score in enumerate(second_half):
                score_list.append(sentence_score)
                index_list.append(sentence_ids[position])

        # Presumably an ascending argsort of the scores, so reading it from
        # the end yields the highest-scored sentences first -- confirm
        # against sorted_index.
        order = sorted_index(score_list)
        total = len(index_list)
        limit = float(total) * summary_ratio

        selected = []
        for rank in range(total):
            if rank >= limit:
                break
            selected.append(index_list[order[total - rank - 1]])
        produce_set.append(selected)

    score = weightRecall(self.test, produce_set)
    print(score)

    rouge_eval = rouge(self.test, produce_set)
    rouge_score = rouge_eval.eval()['rouge_l_f_score']
    print(rouge_score)

    return score, rouge_score
def test_mlp(dataset, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001,
             n_epochs=1000, batch_size=250, n_hidden=100):
    """Train an MLP with minibatch gradient descent and early stopping, then
    print the weighted recall of its predictions on the test set.

    Args:
        dataset: passed straight to ``load_data``, which must return
            ``(datasets, length, testSet)``; ``datasets`` holds the
            train / validation / test ``(x, y)`` pairs in that order.
        learning_rate: step size of the gradient-descent update.
        L1_reg: weight of ``classifier.L1`` in the cost.
        L2_reg: weight of ``classifier.L2_sqr`` in the cost.
        n_epochs: upper bound on the number of passes over the training set.
        batch_size: number of rows per minibatch.
        n_hidden: forwarded to ``MLP`` as ``n_hidden``.

    Returns:
        None. Progress, the weighted recall and the produced set are
        printed; the run time is written to stderr.
    """
    datasets, length, testSet = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    def count_batches(shared_x):
        # Whole minibatches only; a trailing partial batch is never visited.
        return shared_x.get_value(borrow=True).shape[0] // batch_size

    n_train_batches = count_batches(train_set_x)
    n_valid_batches = count_batches(valid_set_x)
    n_test_batches = count_batches(test_set_x)

    # ---- build the model -------------------------------------------------
    print('... building the model')

    index = T.lscalar()   # minibatch number
    x = T.matrix('x')     # input rows
    y = T.ivector('y')    # integer labels, one per row

    rng = numpy.random.RandomState(1234)
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=length,
        n_hidden=n_hidden,
        n_out=2
    )

    # Symbolic training objective: negative log-likelihood plus the two
    # regularisation terms.
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    def minibatch_givens(data_x, data_y):
        # Substitute x / y with the rows of minibatch number `index`.
        lo = index * batch_size
        hi = (index + 1) * batch_size
        return {x: data_x[lo:hi], y: data_y[lo:hi]}

    # Error of the model on one test / validation minibatch.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_givens(test_set_x, test_set_y)
    )
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_givens(valid_set_x, valid_set_y)
    )

    # One plain gradient-descent update per model parameter.
    gradients = [T.grad(cost, param) for param in classifier.params]
    updates = []
    for param, gradient in zip(classifier.params, gradients):
        updates.append((param, param - learning_rate * gradient))

    # Returns the cost of a training minibatch and applies `updates`.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=minibatch_givens(train_set_x, train_set_y)
    )

    def mean_loss(model_fn, n_batches):
        return numpy.mean([model_fn(batch) for batch in xrange(n_batches)])

    # ---- train the model -------------------------------------------------
    print('... training')

    # Early stopping: training halts once the iteration counter reaches
    # `patience`; a sufficiently large validation improvement pushes
    # `patience` out to `patience_increase` times the current iteration.
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    # Validate once per epoch's worth of minibatches, or every patience/2
    # iterations if that is fewer.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    while epoch < n_epochs and not done_looping:
        epoch += 1
        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)
            # Zero-based count of minibatches trained so far.
            step = (epoch - 1) * n_train_batches + minibatch_index

            if (step + 1) % validation_frequency == 0:
                this_validation_loss = mean_loss(validate_model,
                                                 n_valid_batches)
                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' % (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                if this_validation_loss < best_validation_loss:
                    significant = (
                        this_validation_loss
                        < best_validation_loss * improvement_threshold
                    )
                    if significant:
                        patience = max(patience, step * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = step

                    # New best validation loss: measure it on the test set.
                    test_score = mean_loss(test_model, n_test_batches)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= step:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))

    elapsed_minutes = (end_time - start_time) / 60.
    script_name = os.path.split(__file__)[1]
    sys.stderr.write('The code for file ' + script_name +
                     ' ran for %.2fm' % elapsed_minutes + '\n')

    # Predict the whole test set in one call and score the result.
    prediction_model = theano.function(
        inputs=[],
        outputs=classifier.logRegressionLayer.y_pred,
        givens={x: test_set_x}
    )
    produceSet = process_test_data(prediction_model(), testSet)
    print(weightRecall(testSet, produceSet))
    print(produceSet)
def sgd_optimization_mnist(dataset, learning_rate=0.13, n_epochs=5000,
                           batch_size=250):
    """Train a single-output ``logisticRegression`` model on an MSE cost with
    minibatch gradient descent and early stopping, then print the weighted
    recall of its predictions on the test set.

    Args:
        dataset: passed straight to ``load_data``, which must return
            ``(datasets, length, testSet)``; ``datasets`` holds the
            train / validation / test ``(x, y)`` pairs in that order.
        learning_rate: step size of the gradient-descent update.
        n_epochs: upper bound on the number of passes over the training set.
        batch_size: number of rows per minibatch.

    Returns:
        None. Progress, timing, the weighted recall and the produced set
        are printed; the run time is also written to stderr.
    """
    datasets, length, testSet = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    def count_batches(shared_x):
        # Whole minibatches only; a trailing partial batch is never visited.
        return shared_x.get_value(borrow=True).shape[0] // batch_size

    n_train_batches = count_batches(train_set_x)
    n_valid_batches = count_batches(valid_set_x)
    n_test_batches = count_batches(test_set_x)

    index = T.lscalar()   # minibatch number
    x = T.matrix('x')     # input rows
    y = T.matrix('y')     # targets, as a matrix

    classifier = logisticRegression(input=x, n_in=length, n_out=1)
    cost = classifier.mse(y)

    def minibatch_givens(data_x, data_y):
        # Substitute x / y with the rows of minibatch number `index`.
        lo = index * batch_size
        hi = (index + 1) * batch_size
        return {x: data_x[lo:hi], y: data_y[lo:hi]}

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.mse(y),
        givens=minibatch_givens(test_set_x, test_set_y)
    )
    # Predicts the whole test set in one call.
    prediction_model = theano.function(
        inputs=[],
        outputs=classifier.y_pred,
        givens={x: test_set_x}
    )
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.mse(y),
        givens=minibatch_givens(valid_set_x, valid_set_y)
    )

    # Plain gradient descent on the weight matrix and the bias.
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)
    updates = [
        (classifier.W, classifier.W - learning_rate * g_W),
        (classifier.b, classifier.b - learning_rate * g_b),
    ]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=minibatch_givens(train_set_x, train_set_y)
    )

    def mean_loss(model_fn, n_batches):
        return numpy.mean([model_fn(batch) for batch in xrange(n_batches)])

    print('... training the model')

    # Early stopping: training halts once the iteration counter reaches
    # `patience`; a sufficiently large validation improvement pushes
    # `patience` out to `patience_increase` times the current iteration.
    patience = 7000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while epoch < n_epochs and not done_looping:
        epoch += 1
        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)
            # Zero-based count of minibatches trained so far.
            step = (epoch - 1) * n_train_batches + minibatch_index

            if (step + 1) % validation_frequency == 0:
                this_validation_loss = mean_loss(validate_model,
                                                 n_valid_batches)
                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' % (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                if this_validation_loss < best_validation_loss:
                    significant = (
                        this_validation_loss
                        < best_validation_loss * improvement_threshold
                    )
                    if significant:
                        patience = max(patience, step * patience_increase)

                    best_validation_loss = this_validation_loss

                    # New best validation loss: measure it on the test set.
                    test_score = mean_loss(test_model, n_test_batches)
                    print(
                        (
                            ' epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) % (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

            if patience <= step:
                done_looping = True
                break

    end_time = time.clock()
    elapsed = end_time - start_time

    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        ) % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / elapsed))

    script_name = os.path.split(__file__)[1]
    sys.stderr.write('The code for file ' + script_name +
                     ' ran for %.1fs' % elapsed + '\n')

    produceSet = process_test_data(prediction_model(), testSet)
    print(weightRecall(testSet, produceSet))
    print(produceSet)