def logregA_varying_regularization(lam, regul1):
    """Run 5-fold evaluation of LogisticRegression for one (lambda, norm) setting."""
    pa_list = []  # per-fold test ("prediction") accuracy
    ta_list = []  # per-fold training accuracy
    for i in range(5):
        log_ob = LogisticRegression(regLambda=lam, regNorm=regul1)
        log_ob.fit(folds_X_complete[i], folds_y_complete[i])

        y_test_pred = log_ob.predict(X_test[i])
        pa_list.append(accuracy_score(y_complete[i], y_test_pred))

        y_train_pred = log_ob.predict(folds_X_complete[i])
        ta_list.append(accuracy_score(folds_y_complete[i], y_train_pred))

    pa = sum(pa_list) / 5
    ta = sum(ta_list) / 5
    return pa, ta, pa_list, ta_list
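# Usage sketch (hedged): assumes the module-level fold arrays used above
# (folds_X_complete, folds_y_complete, X_test, y_complete) are already
# populated, and that regNorm=2 selects the L2 penalty. The lambda grid is
# illustrative, not taken from the original assignment.
for lam in [0.0, 1e-3, 1e-2, 1e-1, 1.0]:
    pa, ta, _, _ = logregA_varying_regularization(lam, regul1=2)
    print("lambda=%g  mean test acc=%.4f  mean train acc=%.4f" % (lam, pa, ta))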
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_true))
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, t_pred)

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_naive))
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, t_pred)

    # Part (f): Apply correction factor using validation set and test on true labels
    x_val, y_val = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_adjusted),
              correction=alpha)
    np.savetxt(output_path_adjusted, pt_test)
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train_t = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model = LogisticRegression()
    # Fit model on true labels
    model.fit(x_train, y_train_t)
    x_val, y_val_t = util.load_dataset(valid_path, label_col='t', add_intercept=True)
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_true[:-4])

    # Part (b): Train on y-labels and test on true labels
    _, y_train_y = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model = LogisticRegression()
    # Train model on y-labels
    model.fit(x_train, y_train_y)
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_naive[:-4])
def learn():
    stoplist = makeStoplist()
    features = extractFeaturesFromFile(stoplist=stoplist)

    vectorizer = TfidfVectorizer(encoding=ENCODING)
    X_train = vectorizer.fit_transform(
        [" ".join(feature[1:]) for feature in features])
    y_train = np.zeros(len(features))
    for i in range(len(features)):
        if features[i][0] == "+1":
            y_train[i] = 1

    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    io.savemat("X_train", {"X_train": X_train})
    np.save("y_train", y_train)
    joblib.dump(vectorizer, "tfidf.vec")
    clf.save("logreg")
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a):
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-true.jpg')
    np.savetxt(output_path_true, clf.predict(x_test))

    # Part (b):
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-naive.jpg')
    np.savetxt(output_path_naive, clf.predict(x_test))

    # Part (f):
    # alpha = E[h(x) | y = 1], estimated on the labeled validation examples
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    # Corrected probability: p(t = 1 | x) = p(y = 1 | x) / alpha
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)
    # Shift the intercept so the plotted boundary matches h(x) / alpha = 1/2,
    # i.e. theta^T x = -log(2 / alpha - 1)
    clf.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, clf.theta, 'posonly-adjusted.jpg')
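# Sanity-check sketch for the intercept shift used above (all values made up):
# the corrected probability h(x)/alpha crosses 1/2 exactly where
# theta^T x = -log(2/alpha - 1), so adding log(2/alpha - 1) to theta_0 makes
# the ordinary sigmoid boundary coincide with the corrected one.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

alpha = 0.7
theta = np.array([-1.0, 2.0])  # [intercept, slope], hypothetical
x1 = (-np.log(2 / alpha - 1) - theta[0]) / theta[1]
x = np.array([1.0, x1])        # input with intercept term
assert np.isclose(sigmoid(theta @ x) / alpha, 0.5)   # corrected boundary
theta_shifted = theta.copy()
theta_shifted[0] += np.log(2 / alpha - 1)
assert np.isclose(sigmoid(theta_shifted @ x), 0.5)   # shifted boundary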
if __name__ == '__main__':
    # Create parser
    p = Parser()

    # Create training dataset
    ds = p.create_dataset("en-ud-train-projective.conllu", train=True)

    model_file = 'model.pkl'
    # model_file = 'model_t800.pkl'

    # Train LR model
    if os.path.exists(model_file):
        # if model exists, load from file
        print("Loading existing model...")
        lr = pickle.load(open(model_file, 'rb'))
    else:
        # train model using minibatch GD
        lr = LogisticRegression()
        lr.fit(*ds.to_arrays())
        pickle.dump(lr, open(model_file, 'wb'))

    # Create test dataset
    test_ds = p.create_dataset("en-ud-dev.conllu")

    # Copy feature maps to ensure that test datapoints are encoded in the same way
    test_ds.copy_feature_maps(ds)

    # Compute move-level accuracy
    lr.classify_datapoints(*test_ds.to_arrays())

    # Compute UAS and sentence-level accuracy
    t = TreeConstructor(p)
    t.evaluate(lr, 'en-ud-dev.conllu', ds)
data = loadtxt(filename, delimiter=',')
X = data[:, 0:2]
y = np.array([data[:, 2]]).T
n, d = X.shape

# Standardize the data
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

# Map features into a higher dimensional feature space
X = mapFeature(X[:, 0], X[:, 1])

# Train logistic regression
logregModel = LogisticRegression()
logregModel.fit(X, y)

# Reload the data for 2D plotting purposes
data = loadtxt(filename, delimiter=',')
PX = data[:, 0:2]
y = data[:, 2]

# Standardize the data
mean = PX.mean(axis=0)
std = PX.std(axis=0)
PX = (PX - mean) / std

# Plot the decision boundary
h = .02  # step size in the mesh
x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
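# Hedged sketch of the mapFeature helper assumed above: in the classic
# ex2-style assignments it expands two feature columns into all polynomial
# terms up to some fixed degree. The degree and term order here are
# assumptions, not confirmed by this file.
import numpy as np

def mapFeature(x1, x2, degree=6):
    terms = [np.ones_like(x1)]  # bias column
    for total in range(1, degree + 1):
        for j in range(total + 1):
            terms.append((x1 ** (total - j)) * (x2 ** j))
    return np.column_stack(terms)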
def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions.
    """
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_upsampling = save_path.replace(WILDCARD, 'upsampling')

    # *** START CODE HERE ***
    # Part (b): Vanilla logistic regression
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)
    np.savetxt(output_path_naive, y_predict)

    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_naive[:-4])
    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1, balanced_accuracy))
    # Plot the real expected outcome from the validation set
    util.plot(x_val, y_val, clf.theta, output_path_naive[:-4] + "validation")

    # Part (d): Upsampling minority class
    # Make sure to save predicted probabilities to output_path_upsampling using np.savetxt()
    # Repeat minority examples 1 / kappa times
    num_add = int(1 / kappa) - 1
    x_train = np.concatenate(
        (x_train, np.repeat(x_train[y_train == 1, :], num_add, axis=0)), axis=0)
    y_train = np.concatenate(
        (y_train, np.repeat(y_train[y_train == 1], num_add, axis=0)), axis=0)
    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)
    np.savetxt(output_path_upsampling, y_predict)

    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_upsampling[:-4])
    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1, balanced_accuracy))
    # Plot the real expected outcome from the validation set
    util.plot(x_val, y_val, clf.theta, output_path_upsampling[:-4] + "validation")
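# Minimal sketch of the repetition step above in isolation, with a toy
# kappa = 0.25 (so each minority example appears 1/kappa = 4 times in total).
import numpy as np

kappa = 0.25
x = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
y = np.array([0, 1, 0])
num_add = int(1 / kappa) - 1
x_up = np.concatenate((x, np.repeat(x[y == 1], num_add, axis=0)), axis=0)
y_up = np.concatenate((y, np.repeat(y[y == 1], num_add, axis=0)), axis=0)
print(x_up.shape)   # (6, 2): the single minority row now appears 4 times
print(y_up.sum())   # 4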
def evaluatePerformance(numTrials=1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    averaged over numTrials trials of 10-fold cross validation.

    Return: a matrix giving the performance that will contain the following entries:
        stats[0,0] = mean accuracy of decision tree
        stats[0,1] = std deviation of decision tree accuracy
        stats[1,0] = mean accuracy of logistic regression
        stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API **
    '''
    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    num_folds = 10
    percent_incs = 10
    tree_accuracy = np.zeros(shape=[numTrials * num_folds, percent_incs])
    log_accuracy = np.zeros(shape=[numTrials * num_folds, percent_incs])

    # Split the data; seed once here (seeding inside the loop would make every
    # trial's shuffle identical and defeat the averaging over trials)
    k_fold = sklearn.model_selection.KFold(n_splits=num_folds)
    np.random.seed(13)

    for i in range(numTrials):
        # for each trial, shuffle the data
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        j = 0
        for train_index, test_index in k_fold.split(X):
            for k in range(percent_incs):
                # get the data splits for the current fold
                Xtrain = X[train_index[0:(n // percent_incs) * (k + 1)]]
                Xtest = X[test_index]
                ytrain = y[train_index[0:(n // percent_incs) * (k + 1)]]
                ytest = y[test_index]

                # train the decision tree
                clf = tree.DecisionTreeClassifier()
                clf = clf.fit(Xtrain, ytrain)

                # output tree predictions on the remaining data and check them
                tree_pred = clf.predict(Xtest)
                tree_accuracy[i * num_folds + j, k] = accuracy_score(ytest, tree_pred)

                # train logistic regression
                logregModel = LogisticRegression(alpha=0.1, epsilon=0.005)
                logregModel.fit(Xtrain, ytrain)

                # output logreg predictions on the remaining data and check them
                log_pred = logregModel.predict(Xtest)
                log_accuracy[i * num_folds + j, k] = accuracy_score(ytest, log_pred)
            j += 1

    # compute the accuracy statistics on the full training fraction
    meanDecisionTreeAccuracy = np.mean(tree_accuracy[:, percent_incs - 1])
    stddevDecisionTreeAccuracy = np.std(tree_accuracy[:, percent_incs - 1])
    meanLogisticRegressionAccuracy = np.mean(log_accuracy[:, percent_incs - 1])
    stddevLogisticRegressionAccuracy = np.std(log_accuracy[:, percent_incs - 1])

    # plot the learning curve
    tree_array = np.zeros(percent_incs)
    tree_array_std = np.zeros(percent_incs)
    log_array = np.zeros(percent_incs)
    log_array_std = np.zeros(percent_incs)
    for i in range(percent_incs):
        tree_array[i] = np.mean(tree_accuracy[:, i])
        tree_array_std[i] = np.std(tree_accuracy[:, i])
        log_array[i] = np.mean(log_accuracy[:, i])
        log_array_std[i] = np.std(log_accuracy[:, i])

    x_axis = (np.arange(percent_incs) + 1) * 10
    tree_plot = plt.errorbar(x=x_axis, y=tree_array, yerr=tree_array_std)
    log_plot = plt.errorbar(x=x_axis, y=log_array, yerr=log_array_std)
    plt.xlabel('Training Data Used (percentage)')
    plt.ylabel('Accuracy (mean)')
    plt.title('Learning Curve')
    plt.axis([10, 100, 0.0, 1.0])
    plt.grid(True)
    plt.legend([tree_plot, log_plot], ["Decision Tree", "Logistic Regression"], loc=4)
    plt.savefig('learningcurve.pdf')
    #plt.show()

    # make certain that the return value matches the API specification
    stats = np.zeros((2, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanLogisticRegressionAccuracy
    stats[1, 1] = stddevLogisticRegressionAccuracy
    return stats
import time

import numpy as np
from scipy import io
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse-matrix type to match TfidfVectorizer's output
    y_train = np.load("y_train.npy")

    kf = KFold(n_splits=5)
    start = time.time()
    for (i, (train, test)) in enumerate(kf.split(X_train), start=1):
        clf = LogisticRegression()
        clf.fit(X_train[train], y_train[train])
        y_predict = clf.predict(X_train[test])
        y_test = y_train[test]
        print("Fold %d" % i)
        print("Accuracy:  %f" % accuracy_score(y_test, y_predict))
        print("Precision: %f" % precision_score(y_test, y_predict))
        print("Recall:    %f" % recall_score(y_test, y_predict))
        print("F1 score:  %f" % f1_score(y_test, y_predict))
        print("")
    elapsed_time = time.time() - start
    print(str(elapsed_time) + "[sec]")
if __name__ == "__main__":
    # Load Data
    filename = 'data/data1.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # Train logistic regression
    logregModel = LogisticRegression(regLambda=0.0001)
    logregModel.fit(X, y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])
    print(Z)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
def evaluatePerformance(numTrials=1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    averaged over numTrials trials of 10-fold cross validation.

    Return: a matrix giving the performance that will contain the following entries:
        stats[0,0] = mean accuracy of decision tree
        stats[0,1] = std deviation of decision tree accuracy
        stats[1,0] = mean accuracy of logistic regression
        stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API **
    '''
    # Xtrain = X[1:101,:]   # train on first 100 instances
    # Xtest = X[101:,:]
    # ytrain = y[1:101,:]   # test on remaining instances
    # ytest = y[101:,:]

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    # shuffle the data
    idx = np.arange(n)
    np.random.seed(13)

    # number of folds
    k = 10

    # start/end indices of each fold; for this homework's n = 267 and k = 10
    # this yields 0, 26, 52, ..., 267
    fold_index = n // k
    index_arrayX = np.append([i * fold_index for i in range(k)], n)
    index_arrayY = np.append([i * fold_index for i in range(k)], n)

    # accuracy matrices: one row per (trial, fold), one column per 10% increment
    log_learning = np.zeros((numTrials * k, 9))
    tree_learning = np.zeros((numTrials * k, 9))

    # row index into the learning matrices
    ll = 0

    # decision tree object and logistic regression object
    clf = tree.DecisionTreeClassifier()
    lr = LogisticRegression(alpha=0.0000001, regLambda=0.001, epsilon=0.0001,
                            maxNumIters=10000)

    # ~~~~~~~~~~~ main loop ~~~~~~~~~~~~~~~~~
    for i in range(numTrials):
        # shuffle data before each cross validation
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        for j in range(k):
            # separate test data from train data; the test fold advances to the
            # next block of indices after each loop
            end = j + 1
            Xtest = X[index_arrayX[j]:index_arrayX[end], :]
            ytest = y[index_arrayY[j]:index_arrayY[end], :]
            Xtrain = np.append(X[0:index_arrayX[j], :], X[index_arrayX[end]:n, :], axis=0)
            ytrain = np.append(y[0:index_arrayY[j], :], y[index_arrayY[end]:n, :], axis=0)

            size_n, size_d = Xtrain.shape
            # size of 10% blocks
            train_percentage = size_n // 10
            for l in range(1, 10):
                # train / score on 10% of the training fold, then 20%, etc.
                clf = clf.fit(Xtrain[0:train_percentage * l, :],
                              ytrain[0:train_percentage * l, :])
                treey_pred = clf.predict(Xtest)
                lr.fit(Xtrain[0:train_percentage * l, :],
                       ytrain[0:train_percentage * l, :])
                logy_pred = lr.predict(Xtest)

                # fill the accuracies into the accuracy matrices
                log_learning[ll, l - 1] = accuracy_score(ytest, logy_pred)
                tree_learning[ll, l - 1] = accuracy_score(ytest, treey_pred)
            ll += 1

    # total mean accuracy over all percentages, as well as standard deviations
    # over the (k * numTrials) runs
    meanDecisionTreeAccuracy = np.mean(tree_learning)
    meanLogisticRegressionAccuracy = np.mean(log_learning)
    stddevDecisionTreeAccuracy = np.std(tree_learning)
    stddevLogisticRegressionAccuracy = np.std(log_learning)

    # make certain that the return value matches the API specification
    stats = np.zeros((2, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanLogisticRegressionAccuracy
    stats[1, 1] = stddevLogisticRegressionAccuracy

    # mean accuracy for each percentage block, for the learning curve
    plot_log = log_learning.mean(axis=0)
    plot_tree = tree_learning.mean(axis=0)
    percent_array = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    plt.figure(1)
    plt.clf()
    plt.title("Learning Curve")
    plt.xlabel("Percentage")
    plt.ylabel("Accuracy")
    plt.axis([0, 100, .6, .8])
    plt.plot(percent_array, plot_log, 'rx', label='Logistic Regression')
    plt.plot(percent_array, plot_tree, 'bx', label='Decision Tree')
    plt.legend(loc='lower right')
    plt.savefig('learningcurve.png')
    #plt.show()

    return stats
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model_true = LogisticRegression()
    model_true.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    util.plot(x_test, y_test, model_true.theta, 'plot_5a.png')
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    util.plot(x_test, t_test, model_naive.theta, 'plot_5b.png')
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    # alpha = E[h(x) | y = 1], estimated over the labeled (y = 1) validation examples
    alpha = np.mean(model_naive.predict(x_valid[y_valid == 1]))
    util.plot(x_test, t_test, model_naive.theta, 'plot_5f.png', correction=alpha)
    # Corrected probability: p(t = 1 | x) = p(y = 1 | x) / alpha
    np.savetxt(output_path_adjusted, model_naive.predict(x_test) / alpha)
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    def image_path(path):
        return path[:-3] + "png"

    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path, label_col="t", add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, label_col="t", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    prob_test = model.predict(x_test)
    np.savetxt(output_path_true, prob_test)
    util.plot(x_test, t_test, model.theta, save_path=image_path(output_path_true))

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path, label_col="y", add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, label_col="y", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    prob_test = model.predict(x_test)
    np.savetxt(output_path_naive, prob_test)
    util.plot(x_test, t_test, model.theta, save_path=image_path(output_path_naive))

    # Part (f): Apply correction factor using validation set and test on true labels
    # Plot and use np.savetxt to save outputs to output_path_adjusted
    # Estimate alpha
    x_val, y_val = util.load_dataset(valid_path, label_col="y", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])  # Mean over positive y samples.

    # Adjustment
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    np.savetxt(output_path_adjusted, pt_test)

    # Plot
    util.plot(x_test, t_test, model.theta,
              save_path=image_path(output_path_adjusted), correction=alpha)
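# Simulation sketch of the alpha estimate above (toy numbers; the
# near-separable classes are my assumption, matching the Elkan-Noto setup):
# if each true positive is labeled y = 1 with probability alpha_true, and h
# converges to alpha_true * p(t=1|x), then the mean of h over the labeled
# positives is approximately alpha_true.
import numpy as np

rng = np.random.default_rng(0)
alpha_true = 0.6
p_t = np.where(rng.uniform(size=100_000) < 0.5, 0.02, 0.98)  # near 0/1 classes
t = rng.uniform(size=p_t.size) < p_t                         # true labels
y = t & (rng.uniform(size=p_t.size) < alpha_true)            # positive-only labels
h = alpha_true * p_t                                         # idealized classifier
print(h[y].mean())   # ~0.58, close to alpha_true = 0.6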
'''
# Preview one training example (disabled; the opening triple quote of this
# commented-out block was missing in the original)
plt.imshow(train_set_x_orig[index])
plt.show()
print("y = " + str(train_set_y[:, index]) + ", it's a '" +
      classes[np.squeeze(train_set_y[:, index])].decode("utf-8") + "' picture.")
'''

# Flatten the images
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# Normalise image values
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Fit model to the data
model.fit(train_set_x, train_set_y)

# Train the model
model.train(2400, verbose=True)

# Predict values
predictions = model.predict(test_set_x)

# Check accuracy
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()
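# Shape sanity check for the flattening above, with toy dimensions: a batch of
# (m, h, w, c) images becomes an (h*w*c, m) design matrix, one column per image.
import numpy as np

imgs = np.zeros((5, 64, 64, 3))
flat = imgs.reshape(imgs.shape[0], -1).T
print(flat.shape)   # (12288, 5)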
batch_size = 100
n_batches = int(Xtrain.shape[0] / batch_size)
logReg = LogisticRegression(n_batches=n_batches, allow_early_stop=False)
etas = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]

acc_list = []
accuracys_train = []
costs_train = []
accuracys_test = []
costs_test = []
for eta in etas:
    a, b, c, d = logReg.fit(Xtrain, ytrain, eta=eta, n_epochs=2000,
                            Xtest=Xtest, ytest=ytest)
    acc_list.append(logReg.accuracy(Xtest, ytest))
    accuracys_train.append(a)
    costs_train.append(b)
    accuracys_test.append(c)
    costs_test.append(d)
    print("Accuracy vs. test data, own logreg:", acc_list[-1])

plt.figure(figsize=(10, 8))
plt.title("Accuracy score for varying learning rate, logistic regression")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
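# A plausible continuation of the figure set up above (hedged: the original
# snippet stops after the axis labels), plotting one test-accuracy curve per
# learning rate from the lists collected in the loop.
for eta, acc in zip(etas, accuracys_test):
    plt.plot(acc, label=r"$\eta = {:g}$".format(eta))
plt.legend()
plt.show()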
# Load the test set if one was given; otherwise hold out 20% of train
test_file = args.test
if test_file is not None:
    test = pd.read_csv(test_file)
else:
    print("Splitting train to accommodate for test set.")
    train, test = train_test_split(train, test_size=0.2)

train_Y = train['labels'].values
train_X = train.drop(['labels'], axis=1).values
test_Y = test['labels'].values
test_X = test.drop(['labels'], axis=1).values
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

logreg = LogisticRegression(learning_rate=lr, epochs=epochs,
                            initialiser=init, verbose=verbose)
logreg.fit(train_X, train_Y)
predictions = logreg.predict(test_X)

if args.output == ".":
    args.output = os.getcwd()
with open(args.output + "/classification_report.txt", 'w') as f:
    f.write(str(classification_report(test_Y, predictions)))
test['predictions'] = predictions
test.to_csv(args.output + "/predictions.csv")
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    from logreg import LogisticRegression

    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path, add_intercept=True, label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True, label_col='t')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, clf.predict(x_valid))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, add_intercept=True, label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True, label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, clf.predict(x_valid))

    # Part (f): Apply correction factor using validation set and test on true labels
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    # decision probabilities on the validation set
    y_pred = clf.predict(x_valid)
    print(y_pred)
    # alpha = E[h(x) | y = 1] on the labeled validation examples;
    # corrected probability: p(t = 1 | x) = p(y = 1 | x) / alpha
    alpha = np.mean(y_pred[y_valid == 1])
    np.savetxt(output_path_adjusted, y_pred / alpha)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plt.show()