예제 #1
0
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Fits a GDA classifier and a logistic regression classifier on the
    training set, saves the GDA predictions for the evaluation set to
    ``pred_path``, and plots both decision boundaries over the evaluation
    data.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions. The comparison figure is saved
            next to it, with the extension replaced by ``_fig.jpg``.
    """
    import os

    # *** START CODE HERE ***
    # Train a GDA classifier.
    # NOTE: GDA drops the x0 = 1 convention used in the regression examples,
    # so the data is loaded without an intercept column here.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=False)
    gda = GDA()
    gda.fit(x_train, y_train)

    np.savetxt(pred_path, gda.predict(x_eval))

    # Train the logistic regression classifier; this one uses the intercept
    # column, so both datasets are reloaded with add_intercept=True.
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_eval, y_eval = util.load_dataset(eval_path, add_intercept=True)
    log_reg = LogisticRegression()
    log_reg.fit(x_train, y_train)

    # Plot both decision boundaries on the evaluation set. The colour order
    # matches the theta order: GDA is red, logistic regression is orange.
    thetas = [gda.theta, log_reg.theta]
    colours = ["red", "orange"]
    # splitext copes with any extension length (or none), unlike slicing off
    # a fixed four characters.
    fig_path = os.path.splitext(pred_path)[0] + "_fig.jpg"
    title = "LogisticReg (Orange) vs. GDA (Red)"
    util.plot_multiple(x_eval, y_eval, thetas, colours, fig_path, title=title)
예제 #2
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) reuse the model from (d) with the correction factor alpha
            estimated on the validation set.

    Predictions for each part are written with ``np.savetxt`` and a plot is
    saved next to each prediction file (``<prediction path>.png``).

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): train on true labels, evaluate on the validation set.
    model = LogisticRegression()
    train_x, train_t = util.load_dataset(train_path,
                                         label_col="t",
                                         add_intercept=True)
    model.fit(train_x, train_t)
    val_x, val_t = util.load_dataset(valid_path,
                                     label_col='t',
                                     add_intercept=True)
    np.savetxt(pred_path_c, model.predict(val_x))
    # Each part gets its own figure file; a shared path would make every
    # later plot overwrite the earlier one.
    util.plot(val_x, val_t, model.theta, '{}.png'.format(pred_path_c))

    # Part (d): train on y-labels, evaluate against the true labels.
    train_x, train_y = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model.fit(train_x, train_y)
    np.savetxt(pred_path_d, model.predict(val_x))
    util.plot(val_x, val_t, model.theta, '{}.png'.format(pred_path_d))

    # Part (e): estimate alpha on the validation set and test on true labels.
    test_x, test_t = util.load_dataset(test_path,
                                       label_col="t",
                                       add_intercept=True)
    # alpha is the mean model output over the validation examples that are
    # *observed* positive (y == 1); the true labels t must not be used here.
    _, val_y = util.load_dataset(valid_path,
                                 label_col='y',
                                 add_intercept=True)
    labelled = val_y == 1
    # NOTE(review): predict1 / predict2 are project-specific methods; this
    # presumes predict1 returns raw probabilities -- confirm.
    val_scores = model.predict1(val_x)
    alpha = np.sum(val_scores[labelled]) / np.sum(labelled)
    np.savetxt(pred_path_e, model.predict2(test_x, alpha))
    # Corrected boundary: sigmoid(theta^T x) / alpha = 0.5, i.e.
    # theta^T x = log(alpha / (2 - alpha)); fold that offset into the
    # intercept (in place) before plotting.
    model.theta[0] -= np.log(alpha / (2 - alpha))
    util.plot(test_x, test_t, model.theta, '{}.png'.format(pred_path_e))
예제 #3
0
def main(train_path, valid_path, test_path, condition, intercept):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under one of the following conditions:
        1. train on y-labels,
        2. train on t-labels,
        3. train on y-labels and rescale the predictions by the correction
           factor alpha estimated on the validation set.

    In every case the model is evaluated against the true labels ``t`` of
    the test set with a 50% probability cut-off, the decision boundary is
    plotted and the accuracy is printed.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set (only read
            for condition 3).
        test_path: Path to CSV file containing test set.
        condition: Which experiment to run (1, 2 or 3).
        intercept: Forwarded unchanged to the ``LogisticRegression``
            constructor.

    Returns:
        None on success, or the message string
        "Wrong condition: expecting 1, 2, or 3." for any other condition.
    """
    x_test, y_test_t = util.load_dataset(test_path, label_col='t', add_intercept=False)

    # Conditions 1 and 3 train on the partial labels, condition 2 on the
    # true labels.
    if condition == 1 or condition == 3:
        x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=False)
    elif condition == 2:
        x_train, y_train = util.load_dataset(train_path, label_col='t', add_intercept=False)
    else:
        return "Wrong condition: expecting 1, 2, or 3."

    model = LogisticRegression(intercept)
    model.fit(x_train, y_train)

    # For the purposes of this exercise a probability cut-off of 50% is used.
    if condition == 3:
        # Estimate alpha on the held-out validation set (not the test set):
        # the mean predicted probability over examples observed as y == 1.
        x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=False)
        labelled = y_valid == 1
        # NOTE(review): predict() without a threshold presumably returns raw
        # probabilities -- confirm against LogisticRegression.predict.
        alpha = model.predict(x_valid)[labelled].sum() / labelled.sum()

        y_pred = model.predict(x_test) / alpha
        y_pred = (y_pred >= 0.5).astype(np.float64)
        model.plot_with_decision_boundary(x_test, y_test_t, 0.5, alpha)
    else:
        y_pred = model.predict(x_test, 0.5)
        model.plot_with_decision_boundary(x_test, y_test_t, 0.5)

    acc = accuracy(y_pred, y_test_t)
    print(f'Accuracy achieved is {acc * 100: 0.2f} %')
예제 #4
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Skeleton of the experiment driver: it derives the three per-part
    prediction paths and instantiates the classifier for part (c). No data
    is loaded and nothing is written yet.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    # One output path per sub-problem.
    pred_path_c, pred_path_d, pred_path_e = (
        pred_path.replace(WILDCARD, part) for part in ('c', 'd', 'e'))

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    model = LogisticRegression()
예제 #5
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha estimated on the validation set.

    For each part the predicted probabilities are saved with ``np.savetxt``
    and a plot is written to ``<prediction path>.png``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    model = LogisticRegression(max_iter=10000)
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, "{}.png".format(pred_path_c))

    # Report the test-set size and the number of correct predictions at a
    # 0.5 threshold.
    hard_pred = (t_pred >= 0.5).astype(t_pred.dtype)
    print(x_test.shape[0])
    print((t_test == hard_pred).sum())
    np.savetxt(pred_path_c, t_pred)

    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model = LogisticRegression(max_iter=10000)
    model.fit(x_train, y_train)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, "{}.png".format(pred_path_d))
    np.savetxt(pred_path_d, t_pred)

    # Part (e): Apply correction factor using validation set and test on
    # true labels. alpha is the mean model output over the validation
    # examples observed as positive (y == 1).
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col='y',
                                     add_intercept=True)
    labelled = y_val == 1
    alpha = model.predict(x_val)[labelled].sum() / labelled.sum()
    print(alpha)
    # The corrected boundary is theta^T x = log(alpha / (2 - alpha)), which
    # is the same as scaling the intercept by this factor.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1 + (np.log(2 / alpha - 1) / model.theta[0])
    util.plot(x_test,
              t_test,
              model.theta,
              '{}.png'.format(pred_path_e),
              correction=correction)
    # Save the *corrected* probabilities, p(t=1|x) = p(y=1|x) / alpha, not
    # the raw part (d) outputs.
    np.savetxt(pred_path_e, t_pred / alpha)
예제 #6
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the validation set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha.

    Predictions are saved with ``np.savetxt``; plots are written to
    ``output/p02c.png``, ``output/p02d.png`` and ``output/p02e.png``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set (not read here).
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    lr_c = LogisticRegression()
    lr_c.fit(x_train, t_train)

    x_val, t_val = util.load_dataset(valid_path,
                                     label_col='t',
                                     add_intercept=True)
    y_pred_c = lr_c.predict(x_val)
    np.savetxt(pred_path_c, y_pred_c)

    # Part (d): Train on y-labels and test on true labels.
    # Only the label column differs, so the feature matrices are reused.
    _, y_train = util.load_dataset(train_path,
                                   label_col='y',
                                   add_intercept=True)
    lr_d = LogisticRegression()
    lr_d.fit(x_train, y_train)

    _, y_val = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    y_pred_d = lr_d.predict(x_val)
    np.savetxt(pred_path_d, y_pred_d)

    # Part (e): Apply correction factor using validation set.
    # alpha = mean model output over examples observed as positive, and
    # p(t=1|x) = p(y=1|x) / alpha.
    alpha = y_pred_d[y_val == 1].mean()
    y_pred_e = y_pred_d / alpha
    np.savetxt(pred_path_e, y_pred_e)

    # Plot results of (c) and (d).
    util.plot(x_val, t_val, lr_c.theta, 'output/p02c.png')
    util.plot(x_val, t_val, lr_d.theta, 'output/p02d.png')

    # Corrected decision boundary for the part (d) model:
    #   sigmoid(theta^T x) / alpha = 0.5
    #   theta^T x = log(alpha / (2 - alpha))
    #   (theta_0 + log(2 / alpha - 1)) + theta_1 x1 + theta_2 x2 = 0
    # i.e. the intercept is scaled by 1 + log(2 / alpha - 1) / theta_0.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1. + np.log(2. / alpha - 1.) / lr_d.theta[0]

    # Plot result of (e): the corrected boundary belongs to the y-label
    # model, so its theta (not the part (c) model's) must be plotted.
    util.plot(x_val,
              t_val,
              lr_d.theta,
              'output/p02e.png',
              correction=correction)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha estimated on the validation set.

    For each part the 0/1 predictions (probability > 0.5) are saved with
    ``np.savetxt`` and a plot is written to ``output/p02{c,d,e}.png``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Problem (c): train on the true labels.
    x_train, t_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)

    model_t = LogisticRegression()
    model_t.fit(x_train, t_train)

    util.plot(x_test, t_test, model_t.theta, 'output/p02c.png')

    t_pred_c = model_t.predict(x_test)
    np.savetxt(pred_path_c, t_pred_c > 0.5, fmt='%d')

    # Problem (d): train on the partial labels.
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, label_col='y', add_intercept=True)

    model_y = LogisticRegression()
    model_y.fit(x_train, y_train)

    util.plot(x_test, y_test, model_y.theta, 'output/p02d.png')

    y_pred = model_y.predict(x_test)
    np.savetxt(pred_path_d, y_pred > 0.5, fmt='%d')

    # Problem (e): correct the part (d) model with alpha.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=True)

    # alpha = E[h(x) | y = 1]: average the model output over the validation
    # examples observed as positive only, not over the whole validation set.
    valid_scores = model_y.predict(x_valid)
    alpha = np.mean(valid_scores[y_valid == 1])

    # Corrected boundary theta^T x = log(alpha / (2 - alpha)), expressed as
    # a scale on the intercept.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1 + np.log(2 / alpha - 1) / model_y.theta[0]
    util.plot(x_test, t_test, model_y.theta, 'output/p02e.png', correction)

    # p(t=1|x) = p(y=1|x) / alpha
    t_pred_e = y_pred / alpha
    np.savetxt(pred_path_e, t_pred_e > 0.5, fmt='%d')
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) train on ``y`` and rescale the scores by the correction factor
            alpha estimated on the validation set.

    Integer predictions are saved with ``np.savetxt`` and a plot is written
    to ``<prediction path>.png`` for each part.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path, label_col='t')
    x_test, t_test = util.load_dataset(test_path, label_col='t')
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    t_pred = clf.predict(x_test)
    np.savetxt(pred_path_c, t_pred, "%d")
    util.plot(x_test, t_test, clf.theta, pred_path_c + ".png")

    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y')
    x_test, y_test = util.load_dataset(test_path, label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    np.savetxt(pred_path_d, y_pred, fmt="%d")
    util.plot(x_test, t_test, clf.theta, pred_path_d + ".png")

    # Part (e): Apply correction factor using validation set and test on
    # true labels.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    # alpha = mean score over the validation examples with a non-zero
    # (i.e. positive) observed label.
    y_valid_pred = clf.predict_score(x_valid)
    labelled = y_valid != 0
    alpha = y_valid_pred[labelled].sum() / labelled.sum()

    y_pred = clf.predict_score(x_test)
    # The builtin int replaces the np.int alias, which was removed from
    # NumPy and raises AttributeError on current releases.
    y_pred = ((y_pred / alpha) >= 0.5).astype(int)
    np.savetxt(pred_path_e, y_pred, fmt="%d")
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`;
    # the correction itself is computed from the unscaled theta[0].
    util.plot(x_test,
              t_test,
              clf.theta / alpha,
              pred_path_e + ".png",
              correction=1 + np.log(2 / alpha - 1) / clf.theta[0])
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each plotted against the test set's true labels:
        (c) train on the true labels ``t``,
        (d) train on the partial labels,
        (e) a model fitted on the validation set's ``y`` labels, rescaled by
            the correction factor alpha.

    Predicted probabilities are saved with ``np.savetxt`` and a plot is
    written to ``<prediction path>.png`` for each part.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    model_c = LogisticRegression()
    model_c.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model_c.predict(x_test)

    util.plot(x_test, t_test, model_c.theta, '{}.png'.format(pred_path_c))
    np.savetxt(pred_path_c, t_pred)

    # Part (d): Train on y-labels and test on true labels.
    # NOTE(review): no label_col is passed, so this relies on the loader's
    # default label column being 'y' -- confirm against util.load_dataset.
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    model_d = LogisticRegression()
    model_d.fit(x_train, y_train)
    t_pred = model_d.predict(x_test)

    util.plot(x_test, t_test, model_d.theta, '{}.png'.format(pred_path_d))
    np.savetxt(pred_path_d, t_pred)

    # Part (e): Apply correction factor using validation set and test on
    # true labels.
    # NOTE(review): this fits a fresh model on the validation set rather
    # than reusing model_d; kept as-is, flagged for confirmation.
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col='y',
                                     add_intercept=True)
    model_e = LogisticRegression()
    model_e.fit(x_val, y_val)
    y_pred = model_e.predict(x_val)
    labelled = y_val == 1
    alpha = y_pred[labelled].sum() / labelled.sum()
    # Corrected boundary theta^T x = log(alpha / (2 - alpha)), expressed as
    # a scale on the intercept. The log argument is (2 / alpha) - 1.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1 + (np.log(2 / alpha - 1) / model_e.theta[0])

    util.plot(x_test,
              t_test,
              model_e.theta,
              '{}.png'.format(pred_path_e),
              correction=correction)
    # Save the corrected probabilities of the model that was just plotted,
    # not the raw part (d) outputs.
    np.savetxt(pred_path_e, model_e.predict(x_test) / alpha)
예제 #10
0
def main(train_path, valid_path, test_path, pred_path,k=0):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha estimated on the validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
        k: Mode selector. With 0 (default) all three parts run and their
            predictions and plots are written to disk. With 1, 2 or 3 the
            function stops after part (c), (d) or (e) respectively and
            returns that part's predictions thresholded at 0.5, without
            writing anything.

    Returns:
        The 0/1 prediction array of the selected part when ``k`` is 1, 2 or
        3; otherwise None.
    """
    def _binarize(probs):
        # In-place threshold at 0.5: below becomes 0, at or above becomes 1.
        probs[probs < 0.5] = 0
        probs[probs >= 0.5] = 1
        return probs

    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    clf = LogisticRegression()
    x_train, y_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    # NOTE(review): relies on fit() returning the fitted theta -- confirm.
    theta = clf.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    p = clf.predict(x_test)

    if k == 1:
        return _binarize(p)

    if k == 0:
        np.savetxt(pred_path_c, p, delimiter=',')
        util.plot(x_test, y_test, theta, 'output/p02_plot')

    # Part (d): Train on y-labels and test on true labels
    clf.theta = None  # reset so the second fit starts from scratch
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    theta = clf.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    p = clf.predict(x_test)

    if k == 2:
        return _binarize(p)

    if k == 0:
        np.savetxt(pred_path_d, p, delimiter=',')
        util.plot(x_test, y_test, theta, 'output/p02d_plot')

    # Part (e): Apply correction factor using validation set and test on
    # true labels. alpha is the mean model output over the validation
    # examples observed as positive (y == 1).
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    labelled = y_valid == 1
    valid_scores = clf.predict(x_valid)
    alpha = np.sum(valid_scores[labelled]) / np.sum(y_valid[labelled])

    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1 + (np.log(2 / alpha - 1) / clf.theta[0])

    # p(t=1|x) = p(y=1|x) / alpha
    P = clf.predict(x_test) / alpha
    if k == 3:
        return _binarize(P)
    if k == 0:
        # Save the corrected probabilities P, not the raw part (d) output p.
        np.savetxt(pred_path_e, P, delimiter=',')
        util.plot(x_test, y_test, theta, 'output/p02e_plot', correction=correction)
예제 #11
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs divided by the correction
            factor alpha estimated on the validation set.

    The predictions of each part are saved with ``np.savetxt`` and each
    part is plotted against the test set's true labels.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c, pred_path_d, pred_path_e = (
        pred_path.replace(WILDCARD, part) for part in ('c', 'd', 'e'))

    # *** START CODE HERE ***
    # Part (c): fit on the true labels, score the test set.
    features_train, true_train = util.load_dataset(train_path, 't', add_intercept=True)
    features_test, true_test = util.load_dataset(test_path, 't', add_intercept=True)
    model_true = LogisticRegression()
    model_true.fit(features_train, true_train)
    np.savetxt(pred_path_c, model_true.predict(features_test))
    util.plot(features_test, true_test, model_true.theta)

    # Part (d): fit on the partial labels, still score against true labels.
    features_train, partial_train = util.load_dataset(train_path, 'y', add_intercept=True)
    features_test, true_test = util.load_dataset(test_path, 't', add_intercept=True)
    model_partial = LogisticRegression()
    model_partial.fit(features_train, partial_train)
    test_scores = model_partial.predict(features_test)
    np.savetxt(pred_path_d, test_scores)
    util.plot(features_test, true_test, model_partial.theta)

    # Part (e): alpha is the average validation score of the examples whose
    # observed label is 1; the part (d) scores are then divided by it.
    features_val, partial_val = util.load_dataset(valid_path, 'y', add_intercept=True)
    val_scores = model_partial.predict(features_val)
    labelled_scores = [
        score for score, label in zip(val_scores, partial_val) if label == 1
    ]
    alpha = sum(labelled_scores) / len(labelled_scores)

    np.savetxt(pred_path_e, test_scores / alpha)
    util.plot(features_test, true_test, model_partial.theta, alpha)
예제 #12
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha estimated on the validation set.

    Predicted probabilities are saved with ``np.savetxt`` and a plot is
    written to ``<prediction path>.png`` for each part.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)

    initial_theta = np.zeros(x_train.shape[1])
    log_reg_c = LogisticRegression(theta_0=initial_theta)
    log_reg_c.fit(x_train, y_train)
    util.plot(x_test, y_test, log_reg_c.theta, pred_path_c + ".png")
    y_pred = log_reg_c.predict(x_test)
    np.savetxt(pred_path_c, y_pred)

    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, label_col='t', add_intercept=True)

    initial_theta = np.zeros(x_train.shape[1])
    log_reg_d = LogisticRegression(theta_0=initial_theta)
    log_reg_d.fit(x_train, y_train)
    util.plot(x_test, y_test, log_reg_d.theta, pred_path_d + ".png")
    y_pred = log_reg_d.predict(x_test)
    np.savetxt(pred_path_d, y_pred)

    # Part (e): Apply correction factor using validation set and test on
    # true labels. alpha is the mean model output over the validation
    # examples observed as positive (y == 1).
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    alpha = log_reg_d.predict(x_valid[y_valid == 1]).mean()

    # The corrected boundary is theta^T x = log(alpha / (2 - alpha)), i.e.
    # the intercept scaled by 1 + log(2 / alpha - 1) / theta_0. alpha itself
    # is not an intercept scale.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`
    # -- confirm against util.plot.
    correction = 1 + np.log(2 / alpha - 1) / log_reg_d.theta[0]
    util.plot(x_test, y_test, log_reg_d.theta, pred_path_e + ".png", correction=correction)

    # p(t=1|x) = p(y=1|x) / alpha
    y_pred_corrected = y_pred / alpha
    np.savetxt(pred_path_e, y_pred_corrected)
    # *** END CODE HERE ***
예제 #13
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partial labels,
        (e) train on the partial labels and rescale the outputs by the
            correction factor alpha estimated on the validation set.

    Predicted probabilities are saved with ``np.savetxt`` and a plot is
    written to ``<prediction path>.png`` for each part.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # (c) use t
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    # NOTE(review): the test set is loaded with the loader's default label
    # column (presumably 'y'), so the plots below are drawn against those
    # labels rather than the true labels -- confirm this is intended.
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    y_pred = LR.predict(x_test)
    util.plot(x_test, y_test, LR.theta, '{}.png'.format(pred_path_c))
    np.savetxt(pred_path_c, y_pred)

    # (d) use y
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    y_pred = LR.predict(x_test)
    util.plot(x_test, y_test, LR.theta, '{}.png'.format(pred_path_d))
    np.savetxt(pred_path_d, y_pred)

    # (e) use a held-out validation set to estimate alpha
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    # alpha must come from the validation set (not the test set): the mean
    # model output over validation rows whose label is 1. The file is
    # loaded once and masked.
    x_val, y_val = util.load_dataset(valid_path, add_intercept=True)
    alpha = np.mean(LR.predict(x_val[y_val == 1, :]))
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    # p(t=1|x) = p(y=1|x) / alpha
    y_pred = LR.predict(x_test) / alpha
    # Corrected boundary theta^T x = log(alpha / (2 - alpha)): fold the
    # offset into the intercept (in place) before plotting.
    LR.theta[0] = LR.theta[0] + np.log(2 / alpha - 1)
    util.plot(x_test, y_test, LR.theta, '{}.png'.format(pred_path_e))
    np.savetxt(pred_path_e, y_pred)
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments, each evaluated on the test set's true labels:
        (c) train on the true labels ``t``,
        (d) train on the partial labels ``y``,
        (e) the model from (d) with its outputs rescaled by the correction
            factor alpha estimated on the validation set.

    Predictions are saved with ``np.savetxt``; each plot is saved next to
    its prediction file with the extension replaced by ``.png``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; ``WILDCARD`` is replaced by
            'c', 'd' and 'e' for the three parts.
    """
    import os

    def _plot_path(path):
        # splitext only strips the final extension, so dots elsewhere in the
        # path (e.g. "./output/...") are preserved.
        return '{}.png'.format(os.path.splitext(path)[0])

    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path, label_col='t', add_intercept=True)
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path, label_col='t', add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, _plot_path(pred_path_c))
    np.savetxt(pred_path_c, t_pred)

    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, label_col='y', add_intercept=True)
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train, y_train)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, _plot_path(pred_path_d))
    np.savetxt(pred_path_d, t_pred)

    # Part (e): Apply correction factor using validation set and test on
    # true labels. alpha is the mean model output over the validation
    # examples observed as positive (y == 1).
    x_val, y_val = util.load_dataset(valid_path, label_col='y', add_intercept=True)
    labelled = y_val == 1
    alpha = np.sum(model.predict(x_val)[labelled]) / np.sum(labelled)
    # Corrected boundary theta^T x = log(alpha / (2 - alpha)), expressed as
    # a scale on the intercept.
    # NOTE(review): assumes util.plot multiplies theta[0] by `correction`.
    correction = 1 + (np.log(2 / alpha - 1) / model.theta[0])
    util.plot(x_test, t_test, model.theta, _plot_path(pred_path_e), correction=correction)
    # Save the corrected probabilities, p(t=1|x) = p(y=1|x) / alpha, not the
    # raw part (d) outputs.
    np.savetxt(pred_path_e, t_pred / alpha)
예제 #15
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments and evaluates each one on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partially observed labels ``y``,
        (e) reuse the model from (d) with a correction factor ``alpha``
            estimated on the validation set.

    Side effects: for each part, predictions are written with ``np.savetxt``
    to ``pred_path`` with ``WILDCARD`` replaced by ``'c'``, ``'d'`` or
    ``'e'``, and a plot is written to
    ``output/2<part>_<test file name>.pdf`` through ``util.plot``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; contains ``WILDCARD``.

    Raises:
        ValueError: If the validation set has no example with ``y == 1``,
            so ``alpha`` cannot be estimated.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    # File name of the test set; used to label the generated plots.
    ds = test_path.split('/')[-1]

    #######################
    # *** START CODE HERE ***
    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    # One fresh zero vector per model: sharing a single array between the
    # two models would alias their parameters if fit() updates theta in
    # place. Size follows the design matrix instead of a hard-coded 3.
    model_t = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    model_t.fit(x_train, t_train)

    # The test set with true labels is shared by all three parts.
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model_t.predict(x_test)

    # Plot decision boundary on top of test set
    util.plot(x_test, t_test, model_t.theta,
              'output/2c_{ds}.pdf'.format(ds=ds))

    # Save to the part-specific path, not the raw wildcard template.
    np.savetxt(pred_path_c, t_pred)

    #######################
    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model_y = LogisticRegression(theta_0=np.zeros(x_train.shape[1]))
    model_y.fit(x_train, y_train)

    # Predict with the model trained on y-labels (not model_t).
    y_model_pred = model_y.predict(x_test)

    # Plot decision boundary on top of test set (true labels)
    util.plot(x_test, t_test, model_y.theta,
              'output/2d_{ds}.pdf'.format(ds=ds))
    np.savetxt(pred_path_d, y_model_pred)

    #######################
    # Part (e): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path,
                                         label_col='y',
                                         add_intercept=True)
    pred_y_valid = model_y.predict(x_valid)
    V_plus_mask = (y_valid == 1)
    if not np.any(V_plus_mask):
        raise ValueError(
            'validation set contains no examples with y == 1; '
            'cannot estimate alpha')
    # alpha is the mean model output over the observed positives.
    alpha = np.mean(pred_y_valid[V_plus_mask])

    # Plot decision boundary on top of test set.
    # NOTE(review): alpha is passed straight through as `correction`;
    # confirm this matches what util.plot expects for that argument.
    util.plot(x_test,
              t_test,
              model_y.theta,
              'output/2e_{ds}.pdf'.format(ds=ds),
              correction=alpha)

    # Rescale the part (d) outputs by alpha.
    # Assumes predict() returns probabilities rather than hard labels;
    # TODO confirm.
    np.savetxt(pred_path_e, y_model_pred / alpha)
예제 #16
0
def main(train_path, valid_path, test_path, pred_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Runs three experiments and evaluates each one on the test set:
        (c) train on the true labels ``t``,
        (d) train on the partially observed labels ``y``,
        (e) reuse the model from (d) with a correction factor ``alpha``
            estimated on the validation set.

    Side effects: for each part, predictions are written with ``np.savetxt``
    to ``pred_path`` with ``WILDCARD`` replaced by ``'c'``, ``'d'`` or
    ``'e'``, and a plot is written to ``output/p02c``, ``output/p02d`` or
    ``output/p02e`` through ``util.plot``.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        pred_path: Path to save predictions; contains ``WILDCARD``.
    """
    pred_path_c = pred_path.replace(WILDCARD, 'c')
    pred_path_d = pred_path.replace(WILDCARD, 'd')
    pred_path_e = pred_path.replace(WILDCARD, 'e')

    def predict_rows(model, rows, like):
        """Call model.predict on one row at a time; buffer shaped like `like`."""
        out = np.empty_like(like)
        for i, row in enumerate(rows):
            out[i] = model.predict(row)
        return out

    # *** START CODE HERE ***
    # The test set with true labels is shared by all three parts.
    x_eval, t_eval = util.load_dataset(test_path, add_intercept=True, label_col='t')

    # Part (c): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path, add_intercept=True, label_col='t')

    lr = LogisticRegression(verbose=False)
    lr.fit(x_train, t_train)

    t_pred = predict_rows(lr, x_eval, t_eval)
    np.savetxt(pred_path_c, t_pred, delimiter=',')
    util.plot(x_eval, t_eval, lr.theta, 'output/p02c')

    # Part (d): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path, add_intercept=True, label_col='y')

    lr2 = LogisticRegression(verbose=False)
    lr2.fit(x_train, y_train)

    y_pred2 = predict_rows(lr2, x_eval, t_eval)
    np.savetxt(pred_path_d, y_pred2, delimiter=',')
    util.plot(x_eval, t_eval, lr2.theta, 'output/p02d')

    # Part (e): Apply correction factor using validation set and test on true labels
    # alpha is estimated on the observed positives, so the validation set
    # is read with the y-labels rather than the true labels.
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True, label_col='y')

    # Outputs of the y-trained model on every validation row with y == 1.
    positive_outputs = [
        lr2.predict(row) for row, label in zip(x_valid, y_valid) if label == 1
    ]
    # Mean output over the observed positives; like the plain division it
    # replaces, this raises ZeroDivisionError when there are none.
    alpha = sum(positive_outputs) / len(positive_outputs)

    # Rescale the part (d) outputs by alpha.
    # Assumes predict() returns probabilities rather than hard labels;
    # TODO confirm.
    np.savetxt(pred_path_e, y_pred2 / alpha, delimiter=',')
    # NOTE(review): alpha is passed straight through as `correction`;
    # confirm this matches what util.plot expects for that argument.
    util.plot(x_eval, t_eval, lr2.theta, 'output/p02e', correction=alpha)