def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_true))
    np.savetxt(output_path_true, t_pred)

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_naive))
    np.savetxt(output_path_naive, t_pred)

    # Part (f): Apply correction factor using validation set and test on true labels
    x_val, y_val = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_adjusted), correction=alpha)
    np.savetxt(output_path_adjusted, pt_test)
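# Why dividing by alpha works (a hedged sketch, not part of the assignment code):
# y is a positive-only label, i.e. y = 1 only for a random alpha-fraction of the
# true positives (t = 1), so p(y=1|x) = alpha * p(t=1|x). A model fit on y
# therefore estimates alpha * p(t=1|x), and rescaling its output by 1/alpha
# recovers p(t=1|x). Small self-contained numpy check of the identity:
import numpy as np

rng = np.random.default_rng(0)
alpha = 0.3
p_t = rng.uniform(0.0, 1.0, size=1_000_000)    # hypothetical p(t=1|x) for many x
t = rng.uniform(size=p_t.shape) < p_t          # sample the true label t
y = t & (rng.uniform(size=p_t.shape) < alpha)  # only an alpha-fraction of positives get y = 1
print(y.mean() / t.mean())                     # ~ alpha, since p(y=1) = alpha * p(t=1)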
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train_t = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model = LogisticRegression()
    # Fit model on true labels
    model.fit(x_train, y_train_t)
    x_val, y_val_t = util.load_dataset(valid_path, label_col='t', add_intercept=True)
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_true[:-4])

    # Part (b): Train on y-labels and test on true labels
    _, y_train_y = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model = LogisticRegression()
    # Train model on y-labels
    model.fit(x_train, y_train_y)
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_naive[:-4])
def logregA_varying_regularization(lam, regul1):
    pa_list = []
    ta_list = []
    total_ta = 0
    total_pa = 0
    for i in range(5):
        Log_ob = LogisticRegression(regLambda=lam, regNorm=regul1)
        Log_ob.fit(folds_X_complete[i], folds_y_complete[i])
        y_test = Log_ob.predict(X_test[i])
        pa_score = accuracy_score(y_test, y_complete[i])
        pa_list.append(pa_score)
        y_train = Log_ob.predict(folds_X_complete[i])
        ta_score = accuracy_score(y_train, folds_y_complete[i])
        ta_list.append(ta_score)
        total_pa = total_pa + pa_score
        total_ta = total_ta + ta_score
    pa = total_pa / 5
    ta = total_ta / 5
    return pa, ta, pa_list, ta_list
def __init__(self, rng, input, n_in, n_hidden, n_hidden_2, n_out):
    self.hiddenLayer = HiddenLayer(
        rng=rng,
        input=input,
        n_in=n_in,
        n_out=n_hidden,
        activation=T.tanh
    )
    self.hiddenLayer2 = HiddenLayer(
        rng=rng,
        input=self.hiddenLayer.output,
        n_in=n_hidden,
        n_out=n_hidden_2,
        activation=T.tanh
    )
    self.logRegressionLayer = LogisticRegression(
        input=self.hiddenLayer2.output,
        n_in=n_hidden_2,
        n_out=n_out
    )

    # L1 norm
    self.L1 = (
        abs(self.hiddenLayer.W).sum()
        + abs(self.hiddenLayer2.W).sum()
        + abs(self.logRegressionLayer.W).sum()
    )
    print 'self.L1={}'.format(self.L1)

    # square of L2 norm
    self.L2_sqr = (
        (self.hiddenLayer.W ** 2).sum()
        + (self.hiddenLayer2.W ** 2).sum()
        + (self.logRegressionLayer.W ** 2).sum()
    )
    print 'self.L2_sqr={}'.format(self.L2_sqr)

    # Negative log likelihood
    self.negative_log_likelihood = (
        self.logRegressionLayer.negative_log_likelihood
    )
    print 'self.negative_log_likelihood={}'.format(self.negative_log_likelihood)

    self.errors = self.logRegressionLayer.errors
    self.params = (
        self.hiddenLayer.params
        + self.hiddenLayer2.params
        + self.logRegressionLayer.params
    )
    print 'self.params={}'.format(self.params)

    self.input = input
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a):
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-true.jpg')
    np.savetxt(output_path_true, clf.predict(x_test))

    # Part (b):
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-naive.jpg')
    np.savetxt(output_path_naive, clf.predict(x_test))

    # Part (f):
    # alpha is the mean predicted probability over validation examples labeled y = 1
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)
    # Shift the intercept so the plotted decision boundary corresponds to p(t=1|x) = 0.5
    clf.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, clf.theta, 'posonly_adjusted.jpg')
def learn():
    stoplist = makeStoplist()
    features = extractFeaturesFromFile(stoplist=stoplist)
    vectorizer = TfidfVectorizer(encoding=ENCODING)
    X_train = vectorizer.fit_transform(
        [" ".join(feature[1:]) for feature in features])
    y_train = np.zeros(len(features))
    for i in range(len(features)):
        if features[i][0] == "+1":
            y_train[i] = 1
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    io.savemat("X_train", {"X_train": X_train})
    np.save("y_train", y_train)
    joblib.dump(vectorizer, "tfidf.vec")
    clf.save("logreg")
plt.plot(costs_train[i], "--", color=color,
         label="Train, lambda = {:g}".format(lmbda))
plt.plot(costs_test[i], color=color,
         label="Test, lambda = {:g}".format(lmbda))
plt.legend(loc="upper right")
plt.savefig("results/cost_lmbda.pdf")
plt.show()

if mode == "logreg":
    batch_size = 100
    n_batches = int(Xtrain.shape[0] / batch_size)
    logReg = LogisticRegression(n_batches=n_batches, allow_early_stop=False)
    etas = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
    acc_list = []
    accuracys_train = []
    costs_train = []
    accuracys_test = []
    costs_test = []
    for eta in etas:
        a, b, c, d = logReg.fit(Xtrain, ytrain, eta=eta, n_epochs=2000, Xtest=Xtest,
if __name__ == "__main__": # Load Data filename = 'data/data1.dat' data = loadtxt(filename, delimiter=',') X = data[:, 0:2] y = np.array([data[:, 2]]).T n, d = X.shape # Standardize the data mean = X.mean(axis=0) std = X.std(axis=0) X = (X - mean) / std # train logistic regression logregModel = LogisticRegression(regLambda=0.0001) logregModel.fit(X, y) # Plot the decision boundary h = .02 # step size in the mesh x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()]) print Z # Put the result into a color plot Z = Z.reshape(xx.shape) plt.figure(1, figsize=(4, 3)) plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
import time

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse-matrix format to match the TfidfVectorizer output
    y_train = np.load("y_train.npy")
    kf = KFold(n_splits=5)
    start = time.time()
    for (i, (train, test)) in enumerate(kf.split(X_train), start=1):
        clf = LogisticRegression()
        clf.fit(X_train[train], y_train[train])
        y_predict = clf.predict(X_train[test])
        y_test = y_train[test]
        print("Fold %d" % i)
        print("Accuracy: %f" % accuracy_score(y_test, y_predict))
        print("Precision: %f" % precision_score(y_test, y_predict))
        print("Recall: %f" % recall_score(y_test, y_predict))
        print("F1 score: %f" % f1_score(y_test, y_predict))
        print("")
    elapsed_time = time.time() - start
    print(str(elapsed_time) + "[sec]")
if __name__ == '__main__':
    # Create parser
    p = Parser()

    # Create training dataset
    ds = p.create_dataset("en-ud-train-projective.conllu", train=True)

    model_file = 'model.pkl'
    # model_file = 'model_t800.pkl'

    # Train LR model
    if os.path.exists(model_file):
        # if model exists, load from file
        print("Loading existing model...")
        lr = pickle.load(open(model_file, 'rb'))
    else:
        # train model using minibatch GD
        lr = LogisticRegression()
        lr.fit(*ds.to_arrays())
        pickle.dump(lr, open(model_file, 'wb'))

    # Create test dataset
    test_ds = p.create_dataset("en-ud-dev.conllu")
    # Copy feature maps to ensure that test datapoints are encoded in the same way
    test_ds.copy_feature_maps(ds)

    # Compute move-level accuracy
    lr.classify_datapoints(*test_ds.to_arrays())

    # Compute UAS and sentence-level accuracy
    t = TreeConstructor(p)
    t.evaluate(lr, 'en-ud-dev.conllu', ds)
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    def image_path(path):
        return path[:-3] + "png"

    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path, label_col="t", add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, label_col="t", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    prob_test = model.predict(x_test)
    np.savetxt(output_path_true, prob_test)
    util.plot(x_test, t_test, model.theta, save_path=image_path(output_path_true))

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path, label_col="y", add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, label_col="y", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    prob_test = model.predict(x_test)
    np.savetxt(output_path_naive, prob_test)
    util.plot(x_test, t_test, model.theta, save_path=image_path(output_path_naive))

    # Part (f): Apply correction factor using validation set and test on true labels
    # Plot and use np.savetxt to save outputs to output_path_adjusted
    # Estimate alpha
    x_val, y_val = util.load_dataset(valid_path, label_col="y", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])  # Mean over positive y samples.

    # Adjustment
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    np.savetxt(output_path_adjusted, pt_test)

    # Plot
    util.plot(x_test, t_test, model.theta,
              save_path=image_path(output_path_adjusted), correction=alpha)
def __init__(self, rng, input, n_in, n_hidden, n_out):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # Since we are dealing with a one hidden layer MLP, this will translate
    # into a HiddenLayer with a tanh activation function connected to the
    # LogisticRegression layer; the activation function can be replaced by
    # sigmoid or any other nonlinear function
    self.hiddenLayer = HiddenLayer(rng=rng, input=input,
                                   n_in=n_in, n_out=n_hidden,
                                   activation=T.tanh)

    # The logistic regression layer gets as input the hidden units
    # of the hidden layer
    self.logRegressionLayer = LogisticRegression(
        input=self.hiddenLayer.output,
        n_in=n_hidden,
        n_out=n_out)
    # end-snippet-2 start-snippet-3
    # L1 norm ; one regularization option is to enforce L1 norm to
    # be small
    self.L1 = (abs(self.hiddenLayer.W).sum()
               + abs(self.logRegressionLayer.W).sum())

    # square of L2 norm ; one regularization option is to enforce
    # square of L2 norm to be small
    self.L2_sqr = ((self.hiddenLayer.W ** 2).sum()
                   + (self.logRegressionLayer.W ** 2).sum())

    # negative log likelihood of the MLP is given by the negative
    # log likelihood of the output of the model, computed in the
    # logistic regression layer
    self.negative_log_likelihood = (
        self.logRegressionLayer.negative_log_likelihood)

    # same holds for the function computing the number of errors
    self.errors = self.logRegressionLayer.errors

    # the parameters of the model are the parameters of the two layers it is
    # made out of
    self.params = self.hiddenLayer.params + self.logRegressionLayer.params
    # end-snippet-3

    # keep track of model input
    self.input = input
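# A minimal sketch, based on the standard Theano MLP tutorial and not part of
# the original file, of how this class is typically wired into a regularized
# training cost using the L1 / L2_sqr terms built in __init__ above. `MLP` is
# assumed to be the class that owns that __init__; the hyperparameter values
# are illustrative assumptions.
import numpy
import theano.tensor as T

x = T.matrix('x')    # minibatch of inputs
y = T.ivector('y')   # minibatch of labels
rng = numpy.random.RandomState(1234)

classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)
L1_reg, L2_reg, learning_rate = 0.00, 0.0001, 0.01

# classification loss plus the two regularization penalties
cost = (classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr)
gparams = [T.grad(cost, param) for param in classifier.params]
updates = [(param, param - learning_rate * gparam)
           for param, gparam in zip(classifier.params, gparams)]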
index = 27
plt.imshow(train_set_x_orig[index])
plt.show()
print ("y = " + str(train_set_y[:, index]) + ", it's a '" +
       classes[np.squeeze(train_set_y[:, index])].decode("utf-8") + "' picture.")
'''

# Flatten the images
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# Normalise image values
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Fit model to the data
model.fit(train_set_x, train_set_y)

# Train the model
model.train(2400, verbose=True)

# Predict values
predictions = model.predict(test_set_x)

# Check accuracy
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()
def main(): print "############# Load Datasets ##############" import stanfordSentimentTreebank as sst skip_unknown_words = bool(args.get("--skip")) shuffle_flag = bool(args.get("--shuffle")) datatype = args.get("--datatype") if datatype == 5: # Fine-grained 5-class n_class = 5 elif datatype == 2: # Binary 2-class n_class = 2 # print "skip_unknown_words",skip_unknown_words vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype) train_set, test_set, dev_set = datasets train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences get,sentence2ids, ids2sentence = funcs # 関数を読み込み scores, sentences = zip(*train_set_sentences) sentences = [[word for word in sentence.lower().split()] for sentence in sentences] vocab_size = len(vocab) dev_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in dev_set]) test_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in test_set]) train_set = [(score, ids) for score,(ids,unknown_word_count) in train_set] test_set = [(score, ids) for score,(ids,unknown_word_count) in test_set] dev_set = [(score, ids) for score,(ids,unknown_word_count) in dev_set] print "train_size : ", len(train_set) print "dev_size : ", len(dev_set) print "test_size : ", len(test_set) print "-"*30 print "vocab_size: ", len(vocab) print "dev_unknown_words : ", dev_unknown_count print "test_unknown_words : ", test_unknown_count print args # EMB_DIM = 50 EMB_DIM = args.get("--emb_size") vocab_size = len(vocab) feat_map_n_1 = args.get("--feat_map_n_1") feat_map_n_final = args.get("--feat_map_n_final") height = 1 width1 = args.get("--width1") width2 = args.get("--width2") k_top = args.get("--k_top") n_class = n_class alpha = args.get("--alpha") n_epoch = args.get("--n_epoch") dropout_rate0 = args.get("--dropout_rate0") dropout_rate1 = args.get("--dropout_rate1") dropout_rate2 = args.get("--dropout_rate2") activation = args.get("--activation") learn = args.get("--learn") number_of_convolutinal_layer = 2 use_regular = bool(args.get("--use_regular")) regular_c = args.get("--regular_c") pretrain = args.get('--pretrain') if pretrain == 'word2vec': print "*Using word2vec" embeddings_W, model = pretrained_embedding.use_word2vec(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM) # -0.5 ~ 0.5で初期化している elif pretrain == 'glove': print "*Using glove" embeddings_W = pretrained_embedding.use_glove(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM, model_file='glove_model/glove_50_iter2900.model') else: embeddings_W = np.asarray( rng.normal(0, 0.05, size = (vocab_size, EMB_DIM)), dtype = theano.config.floatX ) embeddings_W[0,:] = 0 print np.amax(embeddings_W) print np.amin(embeddings_W) # print "*embeddings" print embeddings_W # print bool(embeddings) # input_x = [1, 3, 4, 5, 0, 22, 4, 5] print "############# Model Setting ##############" x = T.imatrix('x') length_x = T.iscalar('length_x') y = T.ivector('y') # the sentence sentiment label embeddings = WordEmbeddingLayer(rng=rng, input=x, vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W) def dropout(X, p=0.5): if p > 0: retain_prob = 1 - p X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) # X /= retain_prob return X # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer) # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x) # dynamic_func_test = 
theano.function( # inputs = [length_x], # outputs = dynamic_func(length_x), # ) # print dynamic_func(len([1,2,3])) l1 = DynamicConvFoldingPoolLayer(rng, input = dropout(embeddings.output, p=dropout_rate0), filter_shape = (feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation = activation ) l1_no_dropout = DynamicConvFoldingPoolLayer(rng, input = embeddings.output, W=l1.W * (1 - dropout_rate0), b=l1.b, filter_shape = (feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation = activation ) l2 = DynamicConvFoldingPoolLayer(rng, input = dropout(l1.output, p=dropout_rate1), filter_shape = (feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation = activation ) l2_no_dropout = DynamicConvFoldingPoolLayer(rng, input = l1_no_dropout.output, W=l2.W * (1 - dropout_rate1), b=l2.b, filter_shape = (feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation = activation ) # l2_output = theano.function( # inputs = [x,length_x], # outputs = l2.output, # # on_unused_input='ignore' # ) # TODO: # check the dimension # input: 1 x 1 x 6 x 4 # out = l2_output( # np.array([input_x], dtype = np.int32), # len(input_x), # ) # test = theano.function( # inputs = [x], # outputs = embeddings.output, # ) # print "--input--" # print np.array([input_x], dtype = np.int32).shape # print "--input embeddings--" # a = np.array([input_x], dtype = np.int32) # print test(a).shape # print "-- output --" # print out # print out.shape # x = T.dscalar("x") # b = T.dscalar("b") # a = 1 # f = theano.function(inputs=[x,b], outputs=b * x + a) # print f(2,2) # expected = (1, feat_map_n, EMB_DIM / 2, k) # assert out.shape == expected, "%r != %r" %(out.shape, expected) ##### Test Part Three ############### # LogisticRegressionLayer ################################# # print "############# LogisticRegressionLayer ##############" l_final = LogisticRegression( rng, input = dropout(l2.output.flatten(2), p=dropout_rate2), n_in = feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out = n_class, # five sentiment level ) l_final_no_dropout = LogisticRegression( rng, input = l2_no_dropout.output.flatten(2), W = l_final.W * (1 - dropout_rate2), b = l_final.b, n_in = feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out = n_class, # five sentiment level ) print "n_in : ", feat_map_n_final * k_top * EMB_DIM # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.)) # p_y_given_x = theano.function( # inputs = [x, length_x], # outputs = l_final.p_y_given_x, # allow_input_downcast=True, # # mode = "DebugMode" # ) # print "p_y_given_x = " # print p_y_given_x( # np.array([input_x], dtype=np.int32), # len(input_x) # ) cost = theano.function( inputs = [x, length_x, y], outputs = l_final.nnl(y), allow_input_downcast=True, # mode = "DebugMode" ) # print "cost:\n", cost( # np.array([input_x], dtype = np.int32), # len(input_x), 
# np.array([1], dtype = np.int32) # ) print "############# Learning ##############" from sgd import sgd, rmsprop, adagrad, adadelta, adam from regularizer import regularize_l2 layers = [] layers.append(embeddings) layers.append(l1) layers.append(l2) layers.append(l_final) cost = l_final.nnl(y) params = [p for layer in layers for p in layer.params] param_shapes = [l.param_shapes for l in layers] param_grads = [T.grad(cost, param) for param in params] # regularizer setting regularizers = {} regularizers['c'] = regular_c # 2.0, 4.0, 15.0 regularizers['func'] = [None for _ in range(len(params))] if use_regular: regularizers_func = [] regularizers_func.append([regularize_l2(l=0.0001)]) # [embeddings] regularizers_func.append([regularize_l2(l=0.00003), None]) # [W, b] regularizers_func.append([regularize_l2(l=0.000003), None]) # [W, b] regularizers_func.append([regularize_l2(l=0.0001), None]) # [logreg_W, logreg_b] regularizers_func = [r_func for r in regularizers_func for r_func in r] regularizers['func'] = regularizers_func # if third conv layer: 1e-5 print embeddings.params print l1.params print l2.params print l_final.params # updates = sgd(cost, l_final.params) # RegE = 1e-4 # print param_grads if learn == "sgd": updates = sgd(cost, params, lr=0.05) elif learn == "adam": updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) elif learn == "adagrad": updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) elif learn == "adadelta": updates = adadelta(loss_or_grads=cost, params=params, regularizers=regularizers) elif learn == "rmsprop": updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True) # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True) predict = theano.function( inputs = [x, length_x], outputs = T.argmax(l_final_no_dropout.p_y_given_x, axis=1), allow_input_downcast=True, # mode = "DebugMode" ) def b(x_data): return np.array(x_data, dtype=np.int32) def test(test_set): # print "############# TEST ##############" y_pred = [] test_set_y = [] # for train_x, train_y in zip(X_data, Y_data): # print test_set # Accuracy_count = 0 for test_y,test_x in test_set: test_x = b([test_x]) p = predict(test_x, len(test_x))[0] y_pred.append(p) test_set_y.append(test_y) # if test_y == p: # Accuracy_count += 1 # print "*predict :",predict(train_x, len(train_x)), train_y # Accuracy = float(Accuracy_count) / len(test_set) # print " accuracy : %f" % Accuracy, return accuracy_score(test_set_y, y_pred) # print classification_report(test_set_y, y_pred) # train_set_rand = np.ndarray(train_set) train_set_rand = train_set[:] train_cost_sum = 0.0 for epoch in xrange(n_epoch): print "== epoch : %d ==" % epoch if shuffle_flag: np.random.shuffle(train_set_rand) # train_set_rand = np.random.permutation(train_set) for i,x_y_set in enumerate(train_set_rand): train_y, train_x = x_y_set train_x = b([train_x]) train_y = b([train_y]) train_cost = train(train_x, len(train_x) , train_y) train_cost_sum += train_cost if i % 1000 == 0 or i == len(train_set)-1: print "i : (%d/%d)" % (i, len(train_set)) , print " (cost : %f )" % train_cost print ' cost :', train_cost_sum print ' train_set : %f' % test(train_set) print ' dev_set : %f' % test(dev_set) print ' test_set : %f' % test(test_set) '''
print out
print out.shape

expected = (1, feat_map_n, EMB_DIM / 2, k)
assert out.shape == expected, "%r != %r" % (out.shape, expected)

##### Test Part Three ###############
# LogisticRegressionLayer
#################################

print "############# LogisticRegressionLayer ##############"

l3 = LogisticRegression(
    rng,
    input=l2.output.flatten(2),
    n_in=feat_map_n * k * EMB_DIM / 2,  # we fold once, so divide by 2
    n_out=5  # five sentiment level
)

print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.))

y = T.ivector('y')  # the sentence sentiment label

p_y_given_x = theano.function(inputs=[x], outputs=l3.p_y_given_x, mode="DebugMode")

print "p_y_given_x = "
print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32))

cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode")
def __init__(self, x, y, vocab_size, embed_dim, label_n):
    """
    x: theano.tensor.imatrix, (minibatch size, 3)
       the tree matrix of the minibatch;
       for each row, (node id, left child id, right child id)

    y: theano.tensor.ivector, (minibatch size,)
       the labels

    vocab_size: int
       vocabulary size, including both the words and phrases

    embed_dim: int
       the embedding dimension
    """
    assert x.ndim == 2
    assert y.ndim == 1

    parent_ids = x[:, 0]
    children_ids = x[:, 1:]

    rng = np.random.RandomState(1234)

    self.embedding = theano.shared(
        value=rng.normal(0, 0.05, (vocab_size, embed_dim)),
        name='embedding',
        borrow=True,
    )

    self.rntn_layer = RNTNLayer(rng, embed_dim)

    # Update the embedding by
    # forwarding the embedding from bottom to up
    # and getting the vector for each node in each tree
    def update_embedding(child_indices, my_index, embedding):
        assert child_indices.ndim == 1
        assert my_index.ndim == 0

        return T.switch(
            T.eq(child_indices[0], -1),  # NOTE: not using all() because it's non-differentiable
            embedding,  # if no child, return the word embedding
            T.set_subtensor(
                embedding[my_index],  # otherwise, compute the embedding of RNTN layer
                self.rntn_layer.output(embedding[child_indices[0]],
                                       embedding[child_indices[1]])))

    final_embedding, updates = theano.scan(
        fn=update_embedding,
        sequences=[children_ids, parent_ids],
        outputs_info=self.embedding,  # we should pass the whole matrix and fill in the positions if necessary
    )

    self.update_embedding = theano.function(
        inputs=[x],
        updates=[(self.embedding,
                  T.set_subtensor(self.embedding[parent_ids],
                                  final_embedding[-1][parent_ids]))])

    # the logistic regression layer that predicts the label
    self.logreg_layer = LogisticRegression(
        rng,
        input=final_embedding[-1][parent_ids],
        n_in=embed_dim,
        n_out=label_n)

    cost = self.logreg_layer.nnl(y)

    params = self.logreg_layer.params + self.rntn_layer.params + [self.embedding]
    self.params = params

    param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [(vocab_size, embed_dim)]

    grads = [T.grad(cost=cost, wrt=p) for p in params]

    updates = build_adadelta_updates(params, param_shapes, grads, epsilon=0.1)

    # TODO: in this step, forward propagation is done again besides the one in `update_embedding`;
    # this extra computation should be avoided
    self.train = theano.function(inputs=[x, y], updates=updates)
filename = 'data/data2.dat'
data = loadtxt(filename, delimiter=',')
X = data[:, 0:2]
y = np.array([data[:, 2]]).T
n, d = X.shape

# Standardize the data
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

# map features into a higher dimensional feature space
X = mapFeature(X[:, 0], X[:, 1])

# train logistic regression
logregModel = LogisticRegression()
logregModel.fit(X, y)

# reload the data for 2D plotting purposes
data = loadtxt(filename, delimiter=',')
PX = data[:, 0:2]
y = data[:, 2]

# Standardize the data
mean = PX.mean(axis=0)
std = PX.std(axis=0)
PX = (PX - mean) / std

# Plot the decision boundary
h = .02  # step size in the mesh
x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
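# mapFeature is called above (and in the regLambda=10 variant below) but is not
# shown in this excerpt. A minimal sketch of the usual helper from the classic
# regularized logistic-regression homework -- polynomial terms of the two
# features up to degree 6 -- offered as an assumption about that helper, not
# the original code:
import numpy as np

def mapFeature(x1, x2, degree=6):
    """Return columns 1, x1, x2, x1^2, x1*x2, x2^2, ..., x2^degree."""
    x1 = np.asarray(x1).ravel()
    x2 = np.asarray(x2).ravel()
    cols = []
    for i in range(degree + 1):
        for j in range(i + 1):
            cols.append((x1 ** (i - j)) * (x2 ** j))
    return np.column_stack(cols)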
def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions.
    """
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_upsampling = save_path.replace(WILDCARD, 'upsampling')

    # *** START CODE HERE ***
    # Part (b): Vanilla logistic regression
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)
    np.savetxt(output_path_naive, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_naive[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1, balanced_accuracy))

    # plot the real expected outcome from the validation:
    util.plot(x_val, y_val, clf.theta, output_path_naive[:-4] + "validation")

    # Part (d): Upsampling minority class
    # Make sure to save predicted probabilities to output_path_upsampling using np.savetxt()
    # Repeat minority examples 1 / kappa times
    num_add = int(1 / kappa) - 1
    x_train = np.concatenate(
        (x_train, np.repeat(x_train[y_train == 1, :], num_add, axis=0)), axis=0)
    y_train = np.concatenate(
        (y_train, np.repeat(y_train[y_train == 1], num_add, axis=0)), axis=0)
    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)
    np.savetxt(output_path_upsampling, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_upsampling[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1, balanced_accuracy))

    # plot the real expected outcome from the validation:
    util.plot(x_val, y_val, clf.theta, output_path_upsampling[:-4] + "validation")
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model_true = LogisticRegression()
    model_true.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    util.plot(x_test, y_test, model_true.theta, 'plot_5a.png')
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    util.plot(x_test, y_test, model_naive.theta, 'plot_5b.png')
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    x_index = np.where(y_valid == 1)
    # alpha is the average predicted probability over validation examples with y = 1
    alpha = 1 / len(y_valid[y_valid == 1]) * np.sum(model_naive.predict(x_valid[x_index]))
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    util.plot(x_test, y_test, model_naive.theta, 'plot_5f.png', correction=alpha)
    # Correct the naive probabilities: p(t=1|x) = p(y=1|x) / alpha
    np.savetxt(output_path_adjusted, model_naive.predict(x_test) / alpha)
#coding:utf-8
from sklearn.externals import joblib

from logreg import LogisticRegression

ENCODING = "cp1252"

if __name__ == "__main__":
    vectorizer = joblib.load("tfidf.vec")
    clf = LogisticRegression("logreg")
    terms = vectorizer.get_feature_names()
    index_list = list(range(len(terms)))
    index_list.sort(key=lambda i: clf.coef_[i])
    print("top 10")
    for i in index_list[:-11:-1]:
        print(terms[i], clf.coef_[i])
    print("")
    print("worst 10")
    for i in index_list[:10]:
        print(terms[i], clf.coef_[i])
layer2 = ConvFoldingPoolLayer(rng=rng,
                              input=layer1.output,
                              filter_shape=filter_shape,
                              k=k,
                              fold=1,
                              W=theano.shared(value=W, name="W"),
                              b=theano.shared(value=b, name="b"))

n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX)

layer3 = LogisticRegression(rng=rng,
                            input=layer2.output.flatten(2),
                            n_in=n_in,
                            n_out=n_out,
                            W=theano.shared(value=W_logreg, name="W_logreg"),
                            b=theano.shared(value=b_logreg, name="b_logreg"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol))
f2 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.errors(y_symbol))
f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)
f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)
f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
def train_and_test(args, print_config): assert args.conv_layer_n == len(args.filter_widths) == len( args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len( args.ks) # \mod{dim, 2^{\sum fold_flags}} == 0 assert args.embed_dm % (2**sum(args.fold_flags)) == 0 ################### # get the data # ################### datasets = load_data(args.corpus_path) train_set_x, train_set_y = datasets[0] dev_set_x, dev_set_y = datasets[1] test_set_x, test_set_y = datasets[2] word2index = datasets[3] index2word = datasets[4] pretrained_embeddings = datasets[5] n_train_batches = train_set_x.get_value( borrow=True).shape[0] / args.batch_size n_dev_batches = dev_set_x.get_value( borrow=True).shape[0] / args.dev_test_batch_size n_test_batches = test_set_x.get_value( borrow=True).shape[0] / args.dev_test_batch_size train_sent_len = train_set_x.get_value(borrow=True).shape[1] possible_labels = set(train_set_y.get_value().tolist()) if args.use_pretrained_embedding: args.embed_dm = pretrained_embeddings.get_value().shape[1] ################################### # Symbolic variable definition # ################################### x = T.imatrix('x') # the word indices matrix y = T.ivector('y') # the sentiment labels batch_index = T.iscalar('batch_index') rng = np.random.RandomState(1234) ############################### # Construction of the network # ############################### # Layer 1, the embedding layer layer1 = WordEmbeddingLayer( rng, input=x, vocab_size=len(word2index), embed_dm=args.embed_dm, embeddings=(pretrained_embeddings if args.use_pretrained_embedding else None)) dropout_layers = [layer1] layers = [layer1] for i in range(args.conv_layer_n): fold_flag = args.fold_flags[i] # for the dropout layer dpl = DropoutLayer(input=dropout_layers[-1].output, rng=rng, dropout_rate=args.dropout_rates[0]) next_layer_dropout_input = dpl.output next_layer_input = layers[-1].output # for the conv layer filter_shape = (args.nkerns[i], (1 if i == 0 else args.nkerns[i - 1]), 1, args.filter_widths[i]) k = args.ks[i] print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % ( args.conv_activation_unit, i + 2, filter_shape, k, args.dropout_rates[i], args.norm_w, fold_flag) # we have two layers adding to two paths repsectively, # one for training # the other for prediction(averaged model) dropout_conv_layer = ConvFoldingPoolLayer( rng, input=next_layer_dropout_input, filter_shape=filter_shape, k=k, norm_w=args.norm_w, fold=fold_flag, activation=args.conv_activation_unit) # for prediction # sharing weight with dropout layer conv_layer = ConvFoldingPoolLayer( rng, input=next_layer_input, filter_shape=filter_shape, k=k, activation=args.conv_activation_unit, fold=fold_flag, W=dropout_conv_layer.W * (1 - args.dropout_rates[i]), # model averaging b=dropout_conv_layer.b) dropout_layers.append(dropout_conv_layer) layers.append(conv_layer) # last, the output layer # both dropout and without dropout if sum(args.fold_flags) > 0: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum( args.fold_flags)) else: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm print "For output layer, n_in = %d, dropout_rate = %f" % ( n_in, args.dropout_rates[-1]) dropout_output_layer = LogisticRegression( rng, input=dropout_layers[-1].output.flatten(2), n_in=n_in, # divided by 2x(how many times are folded) n_out=len(possible_labels) # five sentiment level ) output_layer = LogisticRegression( rng, input=layers[-1].output.flatten(2), n_in=n_in, 
n_out=len(possible_labels), W=dropout_output_layer.W * (1 - args.dropout_rates[-1]), # sharing the parameters, don't forget b=dropout_output_layer.b) dropout_layers.append(dropout_output_layer) layers.append(output_layer) ############################### # Error and cost # ############################### # cost and error come from different model! dropout_cost = dropout_output_layer.nnl(y) errors = output_layer.errors(y) def prepare_L2_sqr(param_layers, L2_regs): assert len(L2_regs) == len(param_layers) return T.sum([ L2_reg / 2 * ((layer.W if hasattr(layer, "W") else layer.embeddings)**2).sum() for L2_reg, layer in zip(L2_regs, param_layers) ]) L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs) L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:]) if args.use_L2_reg: cost = dropout_cost + L2_sqr cost_no_ebd = dropout_cost + L2_sqr_no_ebd else: cost = dropout_cost cost_no_ebd = dropout_cost ############################### # Parameters to be used # ############################### print "Delay embedding learning by %d epochs" % ( args.embedding_learning_delay_epochs) print "param_layers: %r" % dropout_layers param_layers = dropout_layers ############################## # Parameter Update # ############################## print "Using AdaDelta with rho = %f and epsilon = %f" % (args.rho, args.epsilon) params = [param for layer in param_layers for param in layer.params] param_shapes = [ param for layer in param_layers for param in layer.param_shapes ] param_grads = [T.grad(cost, param) for param in params] # AdaDelta parameter update # E[g^2] # initialized to zero egs = [ theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="Eg:" + param.name) for param_shape, param in zip(param_shapes, params) ] # E[\delta x^2], initialized to zero exs = [ theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="Ex:" + param.name) for param_shape, param in zip(param_shapes, params) ] new_egs = [ args.rho * eg + (1 - args.rho) * g**2 for eg, g in zip(egs, param_grads) ] delta_x = [ -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g for new_eg, ex, g in zip(new_egs, exs, param_grads) ] new_exs = [ args.rho * ex + (1 - args.rho) * (dx**2) for ex, dx in zip(exs, delta_x) ] egs_updates = zip(egs, new_egs) exs_updates = zip(exs, new_exs) param_updates = [(p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)] updates = egs_updates + exs_updates + param_updates # updates WITHOUT embedding # exclude the embedding parameter egs_updates_no_ebd = zip(egs[1:], new_egs[1:]) exs_updates_no_ebd = zip(exs[1:], new_exs[1:]) param_updates_no_ebd = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:] ] updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd def make_train_func(cost, updates): return theano.function( inputs=[batch_index], outputs=[cost], updates=updates, givens={ x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] }) train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb) train_model = make_train_func(cost, updates) def make_error_func(x_val, y_val): return theano.function( inputs=[], outputs=errors, givens={ x: x_val, y: y_val }, ) dev_error = make_error_func(dev_set_x, dev_set_y) test_error = make_error_func(test_set_x, test_set_y) ############################# # Debugging purpose code # ############################# # : PARAMETER TUNING NOTE: # 
some demonstration of the gradient vanishing probelm train_data_at_index = { x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], } train_data_at_index_with_y = { x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] } if print_config["nnl"]: get_nnl = theano.function( inputs=[batch_index], outputs=dropout_cost, givens={ x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] }) if print_config["L2_sqr"]: get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr) get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd) if print_config["grad_abs_mean"]: print_grads = theano.function( inputs=[], outputs=[ theano.printing.Print(param.name)(T.mean(T.abs_(param_grad))) for param, param_grad in zip(params, param_grads) ], givens={ x: train_set_x, y: train_set_y }) activations = [l.output for l in dropout_layers[1:-1]] weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]] if print_config["activation_hist"]: # turn into 1D array get_activations = theano.function( inputs=[batch_index], outputs=[val.flatten(1) for val in activations], givens=train_data_at_index) if print_config["weight_grad_hist"]: # turn into 1D array get_weight_grads = theano.function( inputs=[batch_index], outputs=[val.flatten(1) for val in weight_grads], givens=train_data_at_index_with_y) if print_config["activation_tracking"]: # get the mean and variance of activations for each conv layer get_activation_mean = theano.function( inputs=[batch_index], outputs=[T.mean(val) for val in activations], givens=train_data_at_index) get_activation_std = theano.function( inputs=[batch_index], outputs=[T.std(val) for val in activations], givens=train_data_at_index) if print_config["weight_grad_tracking"]: # get the mean and variance of activations for each conv layer get_weight_grad_mean = theano.function( inputs=[batch_index], outputs=[T.mean(g) for g in weight_grads], givens=train_data_at_index_with_y) get_weight_grad_std = theano.function( inputs=[batch_index], outputs=[T.std(g) for g in weight_grads], givens=train_data_at_index_with_y) #the training loop patience = args.patience # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = np.inf best_iter = 0 start_time = time.clock() done_looping = False epoch = 0 nnls = [] L2_sqrs = [] activation_means = [[] for i in range(args.conv_layer_n)] activation_stds = [[] for i in range(args.conv_layer_n)] weight_grad_means = [[] for i in range(args.conv_layer_n)] weight_grad_stds = [[] for i in range(args.conv_layer_n)] activation_hist_data = [[] for i in range(args.conv_layer_n)] weight_grad_hist_data = [[] for i in range(args.conv_layer_n)] train_errors = [] dev_errors = [] try: print "validation_frequency = %d" % validation_frequency while (epoch < args.n_epochs): epoch += 1 print "At epoch {0}".format(epoch) if epoch == (args.embedding_learning_delay_epochs + 1): print "########################" print "Start training embedding" print "########################" # shuffle the training data train_set_x_data = train_set_x.get_value(borrow=True) train_set_y_data = train_set_y.get_value(borrow=True) 
permutation = np.random.permutation( train_set_x.get_value(borrow=True).shape[0]) train_set_x.set_value(train_set_x_data[permutation]) train_set_y.set_value(train_set_y_data[permutation]) for minibatch_index in range(n_train_batches): if epoch >= (args.embedding_learning_delay_epochs + 1): train_cost = train_model(minibatch_index) else: train_cost = train_model_no_ebd(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # train_error_val = np.mean([train_error(i) # for i in range(n_train_batches)]) dev_error_val = dev_error() # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %( # epoch, # minibatch_index, # train_error_val * 100, # dev_error_val * 100 # ) print "At epoch %d and minibatch %d. \nDev error %.2f%%\n" % ( epoch, minibatch_index, dev_error_val * 100) # train_errors.append(train_error_val) dev_errors.append(dev_error_val) if dev_error_val < best_validation_loss: best_iter = iter #improve patience if loss improvement is good enough if dev_error_val < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = dev_error_val test_error_val = test_error() print((' epoch %i, minibatch %i/%i, test error of' ' best dev error %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_error_val * 100.)) print "Dumping model to %s" % (args.model_path) dump_params(params, args.model_path) if (minibatch_index + 1) % 50 == 0 or minibatch_index == n_train_batches - 1: print "%d / %d minibatches completed" % ( minibatch_index + 1, n_train_batches) if print_config["nnl"]: print "`nnl` for the past 50 minibatches is %f" % ( np.mean(np.array(nnls))) nnls = [] if print_config["L2_sqr"]: print "`L2_sqr`` for the past 50 minibatches is %f" % ( np.mean(np.array(L2_sqrs))) L2_sqrs = [] ################## # Plotting stuff # ################## if print_config["nnl"]: nnl = get_nnl(minibatch_index) # print "nll for batch %d: %f" %(minibatch_index, nnl) nnls.append(nnl) if print_config["L2_sqr"]: if epoch >= (args.embedding_learning_delay_epochs + 1): L2_sqrs.append(get_L2_sqr()) else: L2_sqrs.append(get_L2_sqr_no_ebd()) if print_config["activation_tracking"]: layer_means = get_activation_mean(minibatch_index) layer_stds = get_activation_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip( activation_means, activation_stds, layer_means, layer_stds): layer_ms.append(layer_m) layer_ss.append(layer_s) if print_config["weight_grad_tracking"]: layer_means = get_weight_grad_mean(minibatch_index) layer_stds = get_weight_grad_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip( weight_grad_means, weight_grad_stds, layer_means, layer_stds): layer_ms.append(layer_m) layer_ss.append(layer_s) if print_config["activation_hist"]: for layer_hist, layer_data in zip( activation_hist_data, get_activations(minibatch_index)): layer_hist += layer_data.tolist() if print_config["weight_grad_hist"]: for layer_hist, layer_data in zip( weight_grad_hist_data, get_weight_grads(minibatch_index)): layer_hist += layer_data.tolist() except: import traceback traceback.print_exc(file=sys.stdout) finally: from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt) if print_config["activation_tracking"]: plot_track(activation_means, activation_stds, "activation_tracking") if print_config["weight_grad_tracking"]: plot_track(weight_grad_means, weight_grad_stds, "weight_grad_tracking") if print_config["activation_hist"]: 
plot_hist(activation_hist_data, "activation_hist") if print_config["weight_grad_hist"]: plot_hist(weight_grad_hist_data, "weight_grad_hist") if print_config["error_vs_epoch"]: train_errors = [0] * len(dev_errors) ax = plot_error_vs_epoch( train_errors, dev_errors, title=('Best dev score: %f %% ' ' at iter %i with test error %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) if not args.task_signature: plt.show() else: plt.savefig("plots/" + args.task_signature + ".png") end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) # save the result with open(args.output, "a") as f: f.write("%s\t%f\t%f\n" % (args.task_signature, best_validation_loss, test_error_val)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
filename = 'data/data2.dat'
data = loadtxt(filename, delimiter=',')
X = data[:, 0:2]
y = np.array([data[:, 2]]).T
n, d = X.shape

# Standardize the data
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

# map features into a higher dimensional feature space
X = mapFeature(X[:, 0], X[:, 1])

# train logistic regression
logregModel = LogisticRegression(regLambda=10)
logregModel.fit(X, y)

# reload the data for 2D plotting purposes
data = loadtxt(filename, delimiter=',')
PX = data[:, 0:2]
y = data[:, 2]

# Standardize the data
mean = PX.mean(axis=0)
std = PX.std(axis=0)
PX = (PX - mean) / std

# Plot the decision boundary
h = .02  # step size in the mesh
x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
# Read the test set if one was given; otherwise carve one out of the train set.
test_file = args.test
if test_file:
    test = pd.read_csv(test_file)
if test_file is None:
    print("Splitting train to accommodate for test set.")
    train, test = train_test_split(train, test_size=0.2)

train_Y = train['labels'].values
train_X = train.drop(['labels'], axis=1).values
test_Y = test['labels'].values
test_X = test.drop(['labels'], axis=1).values
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

logreg = LogisticRegression(learning_rate=lr, epochs=epochs,
                            initialiser=init, verbose=verbose)
logreg.fit(train_X, train_Y)
predictions = logreg.predict(test_X)

if args.output == ".":
    args.output = os.getcwd()

with open(args.output + "/classification_report.txt", 'w') as f:
    f.write(str(classification_report(test_Y, predictions)))

test['predictions'] = predictions
test.to_csv(args.output + "/predictions.csv")
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path, add_intercept=True, label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True, label_col='t')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, add_intercept=True, label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True, label_col='y')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()

    # Part (f): Apply correction factor using validation set and test on true labels
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    # decision
    y_pred = clf.predict(x_valid)
    print(y_pred)
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plt.show()
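    # The snippet above stops before saving any predictions or applying the
    # alpha correction. A hedged sketch of the remaining steps, mirroring the
    # other solutions in this section (it reuses the clf / x_valid / y_valid
    # defined above and the output paths from the starter code; it is not the
    # author's original code):
    # alpha = E[h(x) | y = 1], estimated on the validation examples labeled y = 1
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    x_test, t_test = util.load_dataset(test_path, add_intercept=True, label_col='t')
    np.savetxt(output_path_naive, clf.predict(x_test))             # part (b) probabilities
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)  # part (f): p(t=1|x) = p(y=1|x) / alpha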