def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Fits a GDA classifier, saves its predictions on the evaluation set, then
    fits a logistic regression model on the same data and plots both decision
    boundaries on the evaluation set for comparison.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The comparison figure is written
            next to it as "<pred_path without extension>_fig.jpg".
    """
    import os

    # *** START CODE HERE ***
    # Train a GDA classifier.
    # NOTE: GDA drops the x0 = 1 convention used in the regression examples;
    # this has to be accounted for when writing the model in terms of theta.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, _ = util.load_dataset(eval_path, add_intercept=False)
    gda = GDA()
    gda.fit(x_train, y_train)
    np.savetxt(pred_path, gda.predict(x_eval))

    # Train a logistic regression classifier (this one uses the intercept).
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)

    # Plot both decision boundaries on the evaluation set.
    # splitext instead of pred_path[:-4] so extensions of any length work.
    fig_path = os.path.splitext(pred_path)[0] + "_fig.jpg"
    thetas = [gda.theta, log_reg.theta]
    colours = ["red", "orange"]
    # The orange boundary comes from logistic (not linear) regression.
    title = "Logistic regression (Orange) vs. GDA (Red)"
    util.plot_multiple(x_eval, y_eval, thetas, colours, fig_path, title=title)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments:
        (c) train on the true labels t, predict on the validation set;
        (d) train on the observed labels y, predict on the validation set;
        (e) estimate the correction factor alpha on the validation set and
            predict on the test set with the corrected model.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # NOTE(review): all three plots are written to the same 'output' path, so
    # later plots presumably overwrite earlier ones - confirm against util.plot.

    # Part (c): Train and test on true labels.
    model = LogisticRegression()
    train_x, train_t = util.load_dataset(train_path, label_col="t",
                                         add_intercept=True)
    model.fit(train_x, train_t)
    val_x, val_t = util.load_dataset(valid_path, label_col='t',
                                     add_intercept=True)
    np.savetxt(pred_path_c, model.predict(val_x))
    util.plot(val_x, val_t, model.theta, 'output')

    # Part (d): Train on y-labels and test on true labels.
    # The same model instance is refit, as in the original flow.
    train_x, train_y = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model.fit(train_x, train_y)
    np.savetxt(pred_path_d, model.predict(val_x))
    util.plot(val_x, val_t, model.theta, 'output')

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha = E[h(x) | y = 1], so the mask must come from the OBSERVED
    # labels y, not from the true labels t.
    _, val_y = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    test_x, test_t = util.load_dataset(test_path, label_col="t",
                                       add_intercept=True)
    val_scores = model.predict1(val_x)
    labeled = val_y == 1
    alpha = np.sum(val_scores[labeled]) / np.sum(labeled)
    np.savetxt(pred_path_e, model.predict2(test_x, alpha))

    # h(x) / alpha = 0.5  <=>  theta^T x = log(alpha / (2 - alpha)); fold the
    # shift into the intercept so the plotted boundary is the corrected one.
    model.theta[0] -= np.log(alpha / (2 - alpha))
    util.plot(test_x, test_t, model.theta, 'output')
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) model rescaled by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e'). Each plot is saved as "<pred file>.png".
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    model_t = LogisticRegression()
    model_t.fit(x_train, t_train)
    util.plot(x_test, t_test, model_t.theta, '{}.png'.format(pred_path_c))
    np.savetxt(pred_path_c, model_t.predict(x_test))

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression()
    model_y.fit(x_train, y_train)
    y_prob = model_y.predict(x_test)
    util.plot(x_test, t_test, model_y.theta, '{}.png'.format(pred_path_d))
    np.savetxt(pred_path_d, y_prob)

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha is the mean model output over labeled (y = 1) examples.
    x_val, y_val = util.load_dataset(valid_path, label_col='y',
                                     add_intercept=True)
    alpha = model_y.predict(x_val)[y_val == 1].mean()

    # Corrected boundary: theta^T x = -log(2 / alpha - 1). Expressed as a
    # multiplicative factor on theta[0], which is how sibling callers of
    # util.plot pass it.
    correction = 1 + np.log(2 / alpha - 1) / model_y.theta[0]
    util.plot(x_test, t_test, model_y.theta, '{}.png'.format(pred_path_e),
              correction=correction)
    # p(t = 1 | x) = p(y = 1 | x) / alpha: save the corrected probabilities,
    # not the raw part (d) outputs.
    np.savetxt(pred_path_e, y_prob / alpha)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    clf_t = LogisticRegression()
    clf_t.fit(x_train, t_train)
    np.savetxt(pred_path_c, clf_t.predict(x_test))
    util.plot(x_test, t_test, clf_t.theta)

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, 'y', add_intercept=True)
    clf_y = LogisticRegression()
    clf_y.fit(x_train, y_train)
    y_prob = clf_y.predict(x_test)
    np.savetxt(pred_path_d, y_prob)
    util.plot(x_test, t_test, clf_y.theta)

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha is the mean model output over labeled (y = 1) examples.
    x_val, y_val = util.load_dataset(valid_path, 'y', add_intercept=True)
    alpha = clf_y.predict(x_val)[y_val == 1].mean()
    np.savetxt(pred_path_e, y_prob / alpha)

    # The fourth positional argument of util.plot is the save path in sibling
    # callers, so alpha must not be passed there. Pass the boundary
    # correction by keyword instead:
    # h(x) / alpha = 0.5  <=>  theta^T x = -log(2 / alpha - 1).
    correction = 1 + np.log(2 / alpha - 1) / clf_y.theta[0]
    util.plot(x_test, t_test, clf_y.theta, correction=correction)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e'). Each plot is saved as "<pred file>.png".
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    log_reg_c = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    log_reg_c.fit(x_train, t_train)
    util.plot(x_test, t_test, log_reg_c.theta, pred_path_c + ".png")
    np.savetxt(pred_path_c, log_reg_c.predict(x_test))

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    log_reg_d = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    log_reg_d.fit(x_train, y_train)
    util.plot(x_test, t_test, log_reg_d.theta, pred_path_d + ".png")
    y_prob = log_reg_d.predict(x_test)
    np.savetxt(pred_path_d, y_prob)

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha is the mean model output over labeled (y = 1) examples.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                         add_intercept=True)
    alpha = log_reg_d.predict(x_valid[y_valid == 1]).mean()

    # alpha rescales probabilities; the plotted boundary needs the matching
    # intercept shift: h(x) / alpha = 0.5  <=>  theta^T x = -log(2/alpha - 1).
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`,
    # as other callers of util.plot do - confirm.
    correction = 1 + np.log(2 / alpha - 1) / log_reg_d.theta[0]
    util.plot(x_test, t_test, log_reg_d.theta, pred_path_e + ".png",
              correction=correction)
    np.savetxt(pred_path_e, y_prob / alpha)
    # *** END CODE HERE ***
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.
    Predictions are thresholded at 0.5 and saved as integers.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Problem (c): train and test on the true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    model_t = LogisticRegression()
    model_t.fit(x_train, t_train)
    util.plot(x_test, t_test, model_t.theta, 'output/p02c.png')
    np.savetxt(pred_path_c, model_t.predict(x_test) > 0.5, fmt='%d')

    # Problem (d): train on the observed labels, test on the true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression()
    model_y.fit(x_train, y_train)
    # Plot against the TRUE labels so the plot shows how the y-trained
    # boundary fares on the real classes.
    util.plot(x_test, t_test, model_y.theta, 'output/p02d.png')
    y_prob = model_y.predict(x_test)
    np.savetxt(pred_path_d, y_prob > 0.5, fmt='%d')

    # Problem (e): alpha = E[h(x) | y = 1], so average only over the labeled
    # validation examples rather than the whole validation set.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                         add_intercept=True)
    alpha = np.mean(model_y.predict(x_valid[y_valid == 1]))

    # h(x) / alpha = 0.5  <=>  theta^T x = -log(2 / alpha - 1), written as a
    # multiplicative factor on theta[0].
    correction = 1 + np.log(2 / alpha - 1) / model_y.theta[0]
    util.plot(x_test, t_test, model_y.theta, 'output/p02e.png',
              correction=correction)
    np.savetxt(pred_path_e, y_prob / alpha > 0.5, fmt='%d')
def main(train_path, valid_path, test_path, condition, intercept):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Trains one model according to `condition`, evaluates it on the test set's
    true labels with a 0.5 probability cut-off, plots the decision boundary
    and prints the accuracy.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set (used to
            estimate alpha when condition == 3).
        test_path: Path to CSV file containing test set.
        condition: 1 - train on y-labels; 2 - train on t-labels;
            3 - train on y-labels and rescale predictions by alpha.
        intercept: Forwarded unchanged to the LogisticRegression constructor.

    Returns:
        None on success, or an error message string when `condition` is not
        1, 2 or 3.
    """
    x_test, y_test_t = util.load_dataset(test_path, label_col='t',
                                         add_intercept=False)

    if condition == 1 or condition == 3:
        # logistic regression on y-labels (optionally corrected by alpha)
        train_label = 'y'
    elif condition == 2:
        # logistic regression on t-labels
        train_label = 't'
    else:
        return "Wrong condition: expecting 1, 2, or 3."

    x_train, y_train = util.load_dataset(train_path, label_col=train_label,
                                         add_intercept=False)
    model = LogisticRegression(intercept)
    model.fit(x_train, y_train)

    # for purposes of this exercise, let us just use a probability cut-off
    # of 50%
    if condition == 3:
        # Estimate alpha = E[h(x) | y = 1] on the held-out validation set;
        # using the test labels here would leak them into the model.
        x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                             add_intercept=False)
        alpha = model.predict(x_valid)[y_valid == 1].mean()
        y_pred = (model.predict(x_test) / alpha >= 0.5).astype(np.float64)
        model.plot_with_decision_boundary(x_test, y_test_t, 0.5, alpha)
    else:
        y_pred = model.predict(x_test, 0.5)
        model.plot_with_decision_boundary(x_test, y_test_t, 0.5)

    acc = accuracy(y_pred, y_test_t)
    print(f'Accuracy achieved is {acc * 100: 0.2f} %')
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) scores divided by alpha (estimated on the
            validation set) and thresholded at 0.5.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e'). Each plot is saved as "<pred file>.png".
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t')
    x_test, t_test = util.load_dataset(test_path, label_col='t')
    clf_t = LogisticRegression()
    clf_t.fit(x_train, t_train)
    np.savetxt(pred_path_c, clf_t.predict(x_test), fmt="%d")
    util.plot(x_test, t_test, clf_t.theta, pred_path_c + ".png")

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y')
    clf_y = LogisticRegression()
    clf_y.fit(x_train, y_train)
    np.savetxt(pred_path_d, clf_y.predict(x_test), fmt="%d")
    util.plot(x_test, t_test, clf_y.theta, pred_path_d + ".png")

    # Part (e): Apply correction factor using validation set and test on true
    # labels. The part (d) classifier is reused; refitting on the same data
    # would only repeat the work.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
    # alpha is the mean score over labeled (y = 1) validation examples.
    alpha = clf_y.predict_score(x_valid)[y_valid == 1].mean()

    corrected = clf_y.predict_score(x_test) / alpha
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    y_pred = (corrected >= 0.5).astype(int)
    np.savetxt(pred_path_e, y_pred, fmt="%d")

    # NOTE(review): theta is divided by alpha as in the original call;
    # presumably this leaves the line unchanged if util.plot only uses ratios
    # of theta components - confirm.
    correction = 1 + np.log(2 / alpha - 1) / clf_y.theta[0]
    util.plot(x_test, t_test, clf_y.theta / alpha, pred_path_e + ".png",
              correction=correction)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            held-out validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e'). Each plot is saved as "<pred file>.png".
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Always evaluate and plot against the TRUE labels t. Leaving label_col
    # at its default presumably selects the observed labels y - confirm.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # (c) use t
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    model_t = LogisticRegression()
    model_t.fit(x_train, t_train)
    util.plot(x_test, t_test, model_t.theta, '{}.png'.format(pred_path_c))
    np.savetxt(pred_path_c, model_t.predict(x_test))

    # (d) use y
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression()
    model_y.fit(x_train, y_train)
    y_prob = model_y.predict(x_test)
    util.plot(x_test, t_test, model_y.theta, '{}.png'.format(pred_path_d))
    np.savetxt(pred_path_d, y_prob)

    # (e) use a held-out validation set to estimate alpha
    # The estimate must come from valid_path, not from the test set, and
    # only from the labeled (y = 1) examples.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                         add_intercept=True)
    alpha = np.mean(model_y.predict(x_valid[y_valid == 1]))
    np.savetxt(pred_path_e, y_prob / alpha)

    # h(x) / alpha = 0.5  <=>  theta^T x + log(2 / alpha - 1) = 0; fold the
    # shift into the intercept so the plotted boundary is the corrected one.
    model_y.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, model_y.theta, '{}.png'.format(pred_path_e))
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t (also prints the test-set size and
            the number of correct predictions at a 0.5 threshold);
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set (alpha is printed).

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e'). Each plot is saved as "<pred file>.png".
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    model_t = LogisticRegression(max_iter=10000)
    model_t.fit(x_train, t_train)
    t_prob = model_t.predict(x_test)
    util.plot(x_test, t_test, model_t.theta, "{}.png".format(pred_path_c))

    # Report how many test examples are classified correctly at 0.5.
    hard_labels = (t_prob >= 0.5).astype(t_prob.dtype)
    print(x_test.shape[0])
    print((t_test == hard_labels).sum())
    np.savetxt(pred_path_c, t_prob)

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression(max_iter=10000)
    model_y.fit(x_train, y_train)
    y_prob = model_y.predict(x_test)
    util.plot(x_test, t_test, model_y.theta, "{}.png".format(pred_path_d))
    np.savetxt(pred_path_d, y_prob)

    # Part (e): Apply correction factor using validation set and test on true
    # labels.
    # label_col is left at the loader's default here, as in the original call;
    # presumably that is the observed label y - confirm.
    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)
    # alpha is the mean model output over labeled (y = 1) examples.
    alpha = model_y.predict(x_val)[y_val == 1].sum() / y_val.sum()
    print(alpha)

    # h(x) / alpha = 0.5  <=>  theta^T x = -log(2 / alpha - 1), written as a
    # multiplicative factor on theta[0].
    correction = 1 + (np.log(2 / alpha - 1) / model_y.theta[0])
    util.plot(x_test, t_test, model_y.theta, '{}.png'.format(pred_path_e),
              correction=correction)
    # Save the alpha-corrected probabilities, not the raw part (d) outputs.
    np.savetxt(pred_path_e, y_prob / alpha)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha.
    NOTE(review): all predictions and plots use the validation split;
    test_path is accepted but not read - confirm this is intended.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set (currently unused).
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    lr_c = LogisticRegression()
    lr_c.fit(x_train, t_train)
    x_val, t_val = util.load_dataset(valid_path, label_col='t',
                                     add_intercept=True)
    y_pred_c = lr_c.predict(x_val)
    np.savetxt(pred_path_c, y_pred_c)

    # Part (d): Train on y-labels and test on true labels.
    _, y_train = util.load_dataset(train_path, label_col='y',
                                   add_intercept=True)
    lr_d = LogisticRegression()
    lr_d.fit(x_train, y_train)
    _, y_val = util.load_dataset(valid_path, label_col='y',
                                 add_intercept=True)
    y_pred_d = lr_d.predict(x_val)
    np.savetxt(pred_path_d, y_pred_d)

    # Part (e): Apply correction factor using validation set.
    # alpha is the mean model output over labeled (y = 1) examples, and
    # p(t = 1 | x) = p(y = 1 | x) / alpha.
    alpha = y_pred_d[y_val == 1].mean()
    y_pred_e = y_pred_d / alpha
    np.savetxt(pred_path_e, y_pred_e)

    # Plot results of (c) and (d).
    util.plot(x_val, t_val, lr_c.theta, 'output/p02c.png')
    util.plot(x_val, t_val, lr_d.theta, 'output/p02d.png')

    # Corrected boundary: h(x) / alpha = 0.5 with h(x) = 1 / (1 + exp(-z)),
    # z = theta^T x, gives z = beta where beta = -log(2 / alpha - 1).
    # Writing theta0 - beta as theta0 * correction yields
    #     correction = 1 + log(2 / alpha - 1) / theta0.
    # It must use the theta of the y-trained model, whose outputs are the
    # ones being rescaled.
    correction = 1. + np.log(2. / alpha - 1.) / lr_d.theta[0]

    # Plot result of (e).
    util.plot(x_val, t_val, lr_d.theta, 'output/p02e.png',
              correction=correction)
def main(train_path, valid_path, test_path, pred_path, k=0):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
        k: Mode selector. 0 (default) saves predictions and plots for all
            three parts. 1, 2 or 3 stops after part (c), (d) or (e)
            respectively and returns that part's predictions thresholded at
            0.5, without saving anything.

    Returns:
        The 0/1 predictions of the selected part when k is 1, 2 or 3;
        otherwise None.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Part (c): Train and test on true labels.
    clf = LogisticRegression()
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    # fit's return value is used as theta for plotting, as in the original.
    theta = clf.fit(x_train, t_train)
    p = clf.predict(x_test)
    if k == 1:
        return _threshold_in_place(p)
    if k == 0:
        np.savetxt(pred_path_c, p, delimiter=',')
        util.plot(x_test, t_test, theta, 'output/p02_plot')

    # Part (d): Train on y-labels and test on true labels.
    clf.theta = None  # reset before refitting the same instance
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    theta = clf.fit(x_train, y_train)
    p = clf.predict(x_test)
    if k == 2:
        return _threshold_in_place(p)
    if k == 0:
        np.savetxt(pred_path_d, p, delimiter=',')
        util.plot(x_test, t_test, theta, 'output/p02d_plot')

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha is the mean model output over labeled (y = 1) examples.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                         add_intercept=True)
    labeled = y_valid == 1
    alpha = np.sum(clf.predict(x_valid)[labeled]) / np.sum(y_valid[labeled])
    # h(x) / alpha = 0.5  <=>  theta^T x = -log(2 / alpha - 1), written as a
    # multiplicative factor on theta[0].
    correction = 1 + (np.log(2 / alpha - 1) / clf.theta[0])
    corrected = clf.predict(x_test) / alpha
    if k == 3:
        return _threshold_in_place(corrected)
    if k == 0:
        # Save the alpha-corrected predictions, not the raw part (d) ones.
        np.savetxt(pred_path_e, corrected, delimiter=',')
        util.plot(x_test, t_test, theta, 'output/p02e_plot',
                  correction=correction)


def _threshold_in_place(probs):
    """Set entries below 0.5 to 0 and the rest to 1, in place; return probs."""
    probs[probs < 0.5] = 0
    probs[probs >= 0.5] = 1
    return probs
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    ds_name = test_path.split('/')[-1]
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    # Each model gets its own freshly allocated initial theta, sized from the
    # data. A single shared array could be aliased by both models if fit()
    # updates theta in place.
    model_t = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    model_t.fit(x_train, t_train)
    util.plot(x_test, t_test, model_t.theta,
              'output/2c_{ds}.pdf'.format(ds=ds_name))
    # Save to the part-specific path, not the template containing WILDCARD.
    np.savetxt(pred_path_c, model_t.predict(x_test))

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    model_y.fit(x_train, y_train)
    # Predict with the y-trained model (not model_t) and keep the result.
    y_prob = model_y.predict(x_test)
    util.plot(x_test, t_test, model_y.theta,
              'output/2d_{ds}.pdf'.format(ds=ds_name))
    np.savetxt(pred_path_d, y_prob)

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha is the mean model output over labeled (y = 1) examples.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y',
                                         add_intercept=True)
    alpha = np.mean(model_y.predict(x_valid)[y_valid == 1])
    np.savetxt(pred_path_e, y_prob / alpha)

    # alpha rescales probabilities; the plotted boundary needs the matching
    # intercept shift: h(x) / alpha = 0.5  <=>  theta^T x = -log(2/alpha - 1).
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`,
    # as other callers of util.plot do - confirm.
    correction = 1 + np.log(2 / alpha - 1) / model_y.theta[0]
    util.plot(x_test, t_test, model_y.theta,
              'output/2e_{ds}.pdf'.format(ds=ds_name),
              correction=correction)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels t;
        (d) train on the observed labels y;
        (e) the part (d) predictions divided by alpha, estimated on the
            validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; WILDCARD is replaced by the part
            letter ('c', 'd', 'e').
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    x_eval, t_eval = util.load_dataset(test_path, add_intercept=True,
                                       label_col='t')

    # Part (c): Train and test on true labels.
    x_train, t_train = util.load_dataset(train_path, add_intercept=True,
                                         label_col='t')
    lr_t = LogisticRegression(verbose=False)
    lr_t.fit(x_train, t_train)
    np.savetxt(pred_path_c, _predict_per_row(lr_t, x_eval, t_eval),
               delimiter=',')
    util.plot(x_eval, t_eval, lr_t.theta, 'output/p02c')

    # Part (d): Train on y-labels and test on true labels.
    x_train, y_train = util.load_dataset(train_path, add_intercept=True,
                                         label_col='y')
    lr_y = LogisticRegression(verbose=False)
    lr_y.fit(x_train, y_train)
    y_prob = _predict_per_row(lr_y, x_eval, t_eval)
    np.savetxt(pred_path_d, y_prob, delimiter=',')
    util.plot(x_eval, t_eval, lr_y.theta, 'output/p02d')

    # Part (e): Apply correction factor using validation set and test on true
    # labels. alpha = E[h(x) | y = 1], so select the validation examples by
    # their OBSERVED label y, not by the true label t.
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True,
                                         label_col='y')
    labeled = y_valid == 1
    alpha = _predict_per_row(lr_y, x_valid[labeled], y_valid[labeled]).mean()
    np.savetxt(pred_path_e, y_prob / alpha, delimiter=',')

    # alpha rescales probabilities; the plotted boundary needs the matching
    # intercept shift: h(x) / alpha = 0.5  <=>  theta^T x = -log(2/alpha - 1).
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`,
    # as other callers of util.plot do - confirm.
    correction = 1 + np.log(2 / alpha - 1) / lr_y.theta[0]
    util.plot(x_eval, t_eval, lr_y.theta, 'output/p02e',
              correction=correction)


def _predict_per_row(model, x, template):
    """Call model.predict on each row of x; result is shaped like template."""
    preds = np.empty_like(template)
    for i, row in enumerate(x):
        preds[i] = model.predict(row)
    return preds