def logregA_varying_regularization(lam, regul1):
    """Fit and score one logistic-regression model per fold, over five folds.

    For every fold index ``0..4`` a fresh ``LogisticRegression`` is built with
    ``regLambda=lam`` and ``regNorm=regul1``, fitted on
    ``folds_X_complete[i]`` / ``folds_y_complete[i]`` and then scored twice
    with ``accuracy_score``: once on the predictions for ``X_test[i]``
    against ``y_complete[i]``, and once on the data it was fitted on.
    ``folds_X_complete``, ``folds_y_complete``, ``X_test`` and ``y_complete``
    are read from the enclosing module scope.

    Args:
        lam: Value passed to the model as ``regLambda``.
        regul1: Value passed to the model as ``regNorm``.

    Returns:
        A tuple ``(pa, ta, pa_list, ta_list)``: ``pa_list`` holds the per-fold
        scores on ``X_test[i]``, ``ta_list`` the per-fold scores on the
        fitting data, and ``pa`` / ``ta`` are the respective sums divided
        by 5.
    """
    num_folds = 5
    held_out_scores = []
    fit_scores = []
    held_out_sum = 0
    fit_sum = 0

    for fold in range(num_folds):
        model = LogisticRegression(regLambda=lam, regNorm=regul1)
        model.fit(folds_X_complete[fold], folds_y_complete[fold])

        # Score the predictions made for X_test[fold].
        held_out_pred = model.predict(X_test[fold])
        held_out_acc = accuracy_score(held_out_pred, y_complete[fold])
        held_out_scores.append(held_out_acc)

        # Score the predictions made for the data the model was fitted on.
        fit_pred = model.predict(folds_X_complete[fold])
        fit_acc = accuracy_score(fit_pred, folds_y_complete[fold])
        fit_scores.append(fit_acc)

        held_out_sum += held_out_acc
        fit_sum += fit_acc

    return (held_out_sum / num_folds, fit_sum / num_folds,
            held_out_scores, fit_scores)
Пример #2
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Produces three prediction files (written with ``np.savetxt``) and three
    plots (written with ``util.plot``):
        1. model fitted on t-labels,
        2. model fitted on y-labels,
        3. the y-label model's outputs divided by a correction factor alpha
           estimated on the validation set.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions; every occurrence of ``WILDCARD``
            in it is replaced by 'true', 'naive' or 'adjusted'.
    """
    true_path, naive_path, adjusted_path = [
        save_path.replace(WILDCARD, tag)
        for tag in ('true', 'naive', 'adjusted')
    ]

    # Part (a): fit on t-labels, evaluate on the test set's t-labels.
    train_x, train_t = util.load_dataset(
        train_path, label_col='t', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(train_x, train_t)
    test_x, test_t = util.load_dataset(
        test_path, label_col='t', add_intercept=True)
    test_scores = clf.predict(test_x)
    util.plot(test_x, test_t, clf.theta, '{}.png'.format(true_path))
    np.savetxt(true_path, test_scores)

    # Part (b): fit on y-labels, still evaluate against the t-labels.
    train_x, train_y = util.load_dataset(
        train_path, label_col='y', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(train_x, train_y)
    test_x, test_t = util.load_dataset(
        test_path, label_col='t', add_intercept=True)
    test_scores = clf.predict(test_x)
    util.plot(test_x, test_t, clf.theta, '{}.png'.format(naive_path))
    np.savetxt(naive_path, test_scores)

    # Part (f): alpha is the mean model output over the validation examples
    # whose y-label equals 1; the y-label model's test outputs are divided
    # by it.
    val_x, val_y = util.load_dataset(
        valid_path, label_col='y', add_intercept=True)
    val_scores = clf.predict(val_x)
    labelled_positive = val_y == 1
    alpha = np.mean(val_scores[labelled_positive])
    naive_scores = clf.predict(test_x)
    adjusted_scores = naive_scores / alpha
    util.plot(test_x,
              test_t,
              clf.theta,
              '{}.png'.format(adjusted_path),
              correction=alpha)
    np.savetxt(adjusted_path, adjusted_scores)
Пример #3
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    This implementation covers two of the three conditions:
        1. model fitted on t-labels,
        2. model fitted on y-labels.
    Both models are evaluated on the validation set; ``test_path`` is not
    read and nothing is written to the 'adjusted' output path.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set (unused here).
        save_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by the condition name.
    """
    true_out = save_path.replace(WILDCARD, 'true')
    naive_out = save_path.replace(WILDCARD, 'naive')
    # Computed for parity with the other conditions; never used below.
    adjusted_out = save_path.replace(WILDCARD, 'adjusted')

    # Part (a): fit on the true (t) labels.
    train_x, train_t = util.load_dataset(
        train_path, label_col='t', add_intercept=True)
    clf_true = LogisticRegression()
    clf_true.fit(train_x, train_t)

    val_x, val_t = util.load_dataset(
        valid_path, label_col='t', add_intercept=True)

    # Save the model outputs, then plot next to them; the plot path is the
    # output path with its last four characters stripped.
    val_scores = clf_true.predict(val_x)
    np.savetxt(true_out, val_scores)
    util.plot(val_x, val_t, clf_true.theta, true_out[:-4])

    # Part (b): same features, but fit on the y-labels.
    _, train_y = util.load_dataset(
        train_path, label_col='y', add_intercept=True)
    clf_naive = LogisticRegression()
    clf_naive.fit(train_x, train_y)

    # Evaluation still uses the validation set's t-labels for the plot.
    val_scores = clf_naive.predict(val_x)
    np.savetxt(naive_out, val_scores)
    util.plot(val_x, val_t, clf_naive.theta, naive_out[:-4])
Пример #4
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Plots are written to the fixed file names 'posonly-true.jpg',
    'posonly-naive.jpg' and 'posonly_adjusted.jpg'.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by 'true', 'naive' or 'adjusted'.
    """
    true_out, naive_out, adjusted_out = [
        save_path.replace(WILDCARD, tag)
        for tag in ('true', 'naive', 'adjusted')
    ]

    # Part (a): fit and evaluate on t-labels.
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    util.plot(x_test, t_test, model.theta, 'posonly-true.jpg')
    true_scores = model.predict(x_test)
    np.savetxt(true_out, true_scores)

    # Part (b): no label column is passed, so util.load_dataset's default
    # applies (presumably 'y' -- TODO confirm against util).
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    # The plot is still drawn against the t-labels loaded in part (a).
    util.plot(x_test, t_test, model.theta, 'posonly-naive.jpg')
    naive_scores = model.predict(x_test)
    np.savetxt(naive_out, naive_scores)

    # Part (f): alpha is the mean model output over validation rows whose
    # label equals 1.
    valid_positives = x_valid[y_valid == 1]
    alpha = np.mean(model.predict(valid_positives))
    adjusted_scores = model.predict(x_test) / alpha
    np.savetxt(adjusted_out, adjusted_scores)
    # Shift the intercept in place so the plotted boundary sits where the
    # adjusted output (output / alpha) equals 0.5.
    intercept_shift = np.log(2 / alpha - 1)
    model.theta[0] = model.theta[0] + intercept_shift
    util.plot(x_test, t_test, model.theta, 'posonly_adjusted.jpg')
Пример #5
0
def evaluatePerformance(numTrials=1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    averaged over numTrials trials of 10-fold cross validation, and save a
    learning curve to 'learningcurve.pdf'.

    For every trial the data is reshuffled; for every fold both models are
    trained on the first 10%, 20%, ..., 100% of that fold's training indices
    and scored on the fold's test indices.

    Args:
      numTrials: number of shuffle-and-cross-validate repetitions.

    Return:
      a matrix giving the performance (measured with 100% of the training
      fold) that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data: first column is the label, the rest are features.
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n = X.shape[0]

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    num_folds = 10
    percent_incs = 10
    # One row per (trial, fold); one column per training-set percentage.
    tree_accuracy = np.zeros(shape=[numTrials*num_folds, percent_incs])
    log_accuracy = np.zeros(shape=[numTrials*num_folds, percent_incs])

    # split the data
    k_fold = sklearn.cross_validation.KFold(len(y), n_folds=num_folds)

    # Seed once, outside the loop: re-seeding on every trial would apply the
    # exact same permutation each time instead of a fresh shuffle.
    np.random.seed(13)

    for i in range(numTrials):
        # for each trial, shuffle the data
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        for j, (train_index, test_index) in enumerate(k_fold):
            Xtest, ytest = X[test_index], y[test_index]
            num_train = len(train_index)
            row = i*num_folds + j

            for k in range(percent_incs):
                # Use the first (k+1)/percent_incs share of THIS fold's
                # training indices; '//' keeps the slice bound an integer on
                # both Python 2 and Python 3.
                subset = train_index[:(num_train * (k + 1)) // percent_incs]
                Xtrain, ytrain = X[subset], y[subset]

                # train the decision tree and score it on the held-out fold
                clf = tree.DecisionTreeClassifier()
                clf = clf.fit(Xtrain, ytrain)
                tree_pred = clf.predict(Xtest)
                tree_accuracy[row, k] = accuracy_score(ytest, tree_pred)

                # train logistic regression and score it on the same fold
                logregModel = LogisticRegression(alpha=0.1, epsilon=0.005)
                logregModel.fit(Xtrain, ytrain)
                log_pred = logregModel.predict(Xtest)
                log_accuracy[row, k] = accuracy_score(ytest, log_pred)

    # Summary statistics use the last column (full training fold).
    meanDecisionTreeAccuracy = np.mean(tree_accuracy[:, percent_incs-1])
    stddevDecisionTreeAccuracy = np.std(tree_accuracy[:, percent_incs-1])
    meanLogisticRegressionAccuracy = np.mean(log_accuracy[:, percent_incs-1])
    stddevLogisticRegressionAccuracy = np.std(log_accuracy[:, percent_incs-1])

    # Learning curve: mean and std per training-set percentage.
    tree_array = np.mean(tree_accuracy, axis=0)
    tree_array_std = np.std(tree_accuracy, axis=0)
    log_array = np.mean(log_accuracy, axis=0)
    log_array_std = np.std(log_accuracy, axis=0)

    x_axis = (np.arange(percent_incs) + 1) * 10
    tree_plot = plt.errorbar(x=x_axis, y=tree_array, yerr=tree_array_std)
    log_plot = plt.errorbar(x=x_axis, y=log_array, yerr=log_array_std)
    plt.xlabel('Training Data Used (percentage)')
    plt.ylabel('Accuracy (mean)')
    plt.title('Learning Curve')
    plt.axis([10, 100, 0.0, 1.0])
    plt.grid(True)
    plt.legend([tree_plot, log_plot], ["Decision Tree", "Logistic Regression"], loc=4)

    plt.savefig('learningcurve.pdf')

    # make certain that the return value matches the API specification
    stats = np.zeros((2, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanLogisticRegressionAccuracy
    stats[1, 1] = stddevLogisticRegressionAccuracy
    return stats
Пример #6
0
    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda = 0.00000001)
    logregModel.fit(X,y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # Configure the plot display
    plt.xlabel('Exam 1 Score')
    plt.ylabel('Exam 2 Score')

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda=0.0001)
    logregModel.fit(X, y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])
    print Z

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # Configure the plot display
    plt.xlabel('Exam 1 Score')
    plt.ylabel('Exam 2 Score')

    plt.xlim(xx.min(), xx.max())
Пример #8
0
    y = data[:, 2]

    # Standardize the data
    mean = PX.mean(axis=0)
    std = PX.std(axis=0)
    PX = (PX - mean) / std

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
    y_min, y_max = PX[:, 1].min() - .5, PX[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    allPoints = np.c_[xx.ravel(), yy.ravel()]
    allPoints = mapFeature(allPoints[:, 0], allPoints[:, 1])
    Z = logregModel.predict(allPoints)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot the training points
    plt.scatter(PX[:, 0], PX[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # Configure the plot display
    plt.xlabel('Microchip Test 1')
    plt.ylabel('Microchip Test 2')

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
Пример #9
0
import time

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    # Load the feature matrix and convert its sparse format (so it matches
    # what TfidfVectorizer outputs), then load the labels.
    features = io.loadmat("X_train")["X_train"].tocsr()
    labels = np.load("y_train.npy")
    splitter = KFold(n_splits=5)

    # Report lines printed for every fold, in this order.
    reports = (
        ("正解率: %f", accuracy_score),
        ("適合率: %f", precision_score),
        ("再現率: %f", recall_score),
        ("F1スコア: %f", f1_score),
    )

    started_at = time.time()
    fold_no = 0
    for train_idx, test_idx in splitter.split(features):
        fold_no += 1
        model = LogisticRegression()
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        expected = labels[test_idx]
        print("Fold %d" % fold_no)
        for template, metric in reports:
            print(template % metric(expected, predicted))
        print("")
    # Wall-clock time for the whole cross-validation run.
    print(str(time.time() - started_at) + "[sec]")
Пример #10
0
#coding:utf-8

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    # Load the feature matrix and convert its sparse format (so it matches
    # what TfidfVectorizer outputs), then load the labels.
    features = io.loadmat("X_train")["X_train"].tocsr()
    labels = np.load("y_train.npy")

    # The model is built from the name "logreg" and is not fitted here
    # (presumably this loads previously saved parameters -- TODO confirm
    # against logreg.LogisticRegression).
    model = LogisticRegression("logreg")
    predicted = model.predict(features)

    reports = (
        ("正解率: %f", accuracy_score),
        ("適合率: %f", precision_score),
        ("再現率: %f", recall_score),
        ("F1スコア: %f", f1_score),
    )
    for template, metric in reports:
        print(template % metric(labels, predicted))
Пример #11
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Each condition writes its test-set outputs with ``np.savetxt`` and a plot
    whose path is the output path with its last three characters replaced
    by "png".

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by 'true', 'naive' or 'adjusted'.
    """
    true_out = save_path.replace(WILDCARD, 'true')
    naive_out = save_path.replace(WILDCARD, 'naive')
    adjusted_out = save_path.replace(WILDCARD, 'adjusted')

    def image_path(path):
        # Swap the trailing three characters of the path for "png".
        stem = path[:-3]
        return stem + "png"

    # Part (a): fit on t-labels, evaluate on the test set's t-labels.
    train_x, train_t = util.load_dataset(
        train_path, label_col="t", add_intercept=True)
    test_x, test_t = util.load_dataset(
        test_path, label_col="t", add_intercept=True)
    clf = LogisticRegression()
    clf.fit(train_x, train_t)

    test_scores = clf.predict(test_x)
    np.savetxt(true_out, test_scores)
    util.plot(test_x, test_t, clf.theta, save_path=image_path(true_out))

    # Part (b): fit on y-labels; the plot still uses the t-labels from (a).
    train_x, train_y = util.load_dataset(
        train_path, label_col="y", add_intercept=True)
    test_x, test_y = util.load_dataset(
        test_path, label_col="y", add_intercept=True)

    clf = LogisticRegression()
    clf.fit(train_x, train_y)

    test_scores = clf.predict(test_x)
    np.savetxt(naive_out, test_scores)
    util.plot(test_x, test_t, clf.theta, save_path=image_path(naive_out))

    # Part (f): refit on the y-labels, estimate alpha on the validation set
    # and divide the test outputs by it.
    val_x, val_y = util.load_dataset(
        valid_path, label_col="y", add_intercept=True)
    clf = LogisticRegression()
    clf.fit(train_x, train_y)
    val_scores = clf.predict(val_x)
    # alpha: mean model output over validation rows with y == 1.
    labelled_positive = val_y == 1
    alpha = np.mean(val_scores[labelled_positive])

    naive_scores = clf.predict(test_x)
    adjusted_scores = naive_scores / alpha
    np.savetxt(adjusted_out, adjusted_scores)

    util.plot(test_x,
              test_t,
              clf.theta,
              save_path=image_path(adjusted_out),
              correction=alpha)
Пример #12
0
plt.imshow(train_set_x_orig[index])
plt.show()
print ("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") +  "' picture.")
'''

# Flatten the images: collapse every axis after the first into a single one,
# then transpose so that each column holds one example.
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# Normalise image values by dividing by 255
# (presumably 8-bit pixel intensities -- TODO confirm).
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Hand the training data to the model
model.fit(train_set_x, train_set_y)

# Train the model
# NOTE(review): 2400 is presumably the number of training iterations --
# confirm against LogisticRegression.train.
model.train(2400, verbose=True)

# Predict values for the test set
predictions = model.predict(test_set_x)

# Check accuracy of the predictions against the test labels
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()
Пример #13
0
#coding:utf-8

import sys
from sklearn.externals import joblib
from question71 import makeStoplist
from question72 import extractFeaturesFromString
from logreg import LogisticRegression

if __name__ == "__main__":
    # Interactive classifier: read one text per line from stdin and print
    # "-1" or "+1" for each, until stdin is closed.

    vectorizer = joblib.load("tfidf.vec")
    clf = LogisticRegression("logreg")
    stoplist = makeStoplist()
    # Index 0 of the prediction selects "-1", index 1 selects "+1".
    label_names = ["-1", "+1"]
    while True:
        try:
            line = input()
        except EOFError:
            # stdin was closed (end of piped input / Ctrl-D): stop cleanly
            # instead of terminating with a traceback.
            break
        tokens = extractFeaturesFromString(line, stoplist)
        features = vectorizer.transform([" ".join(tokens)])
        print(label_names[clf.predict(features)[0]])
        # Flush after every answer so a piped consumer sees it immediately.
        sys.stdout.flush()
Пример #14
0
from sklearn.metrics import precision_score, recall_score
from logreg import LogisticRegression
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # Load the feature matrix and labels, and build the model from the name
    # "logreg" (no fitting happens in this script).
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # Convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    clf = LogisticRegression("logreg")

    # Plot how precision and recall change with the threshold.
    # Thresholds run from 0.0 to 0.95 in steps of 0.05.
    threshold_list = [i * 0.05 for i in range(20)]
    precision_list = []
    recall_list = []
    for threshold in threshold_list:
        # The threshold is passed as the second argument to predict.
        y_predict = clf.predict(X_train, threshold)
        precision_list.append(precision_score(y_train, y_predict))
        recall_list.append(recall_score(y_train, y_predict))

    plt.plot(threshold_list, precision_list, label="precision", color="red")
    plt.plot(threshold_list, recall_list, label="recall", color="blue")

    plt.xlabel("threshold")
    plt.ylabel("rate")
    plt.xlim(0.0, 1.0)
    plt.ylim(0, 1)
    plt.title("logistic_regresssion")
    plt.legend(loc=3)
    plt.show()

    # Plot the precision-recall curve
Пример #15
0
def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    For each condition the validation-set outputs are saved with
    ``np.savetxt``, accuracy figures are printed, and two plots are drawn
    (thresholded predictions and the validation labels themselves).

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by 'naive' or 'upsampling'.
    """

    def fit_and_report(x_fit, y_fit, x_eval, y_eval, out_path):
        # Fit a fresh model, save its raw outputs, then report on the
        # outputs thresholded at 0.5.
        model = LogisticRegression()
        model.fit(x_fit, y_fit)
        scores = model.predict(x_eval)

        np.savetxt(out_path, scores)
        predicted = scores >= 0.5
        # Plot paths are the output path minus its last four characters.
        plot_stem = out_path[:-4]
        util.plot(x_eval, predicted, model.theta, plot_stem)

        accuracy = np.mean(predicted == y_eval)
        # Per-class accuracy: correct predictions of a class divided by the
        # number of examples of that class.
        A_0 = np.sum((predicted == 0) * (y_eval == 0)) / np.sum(y_eval == 0)
        A_1 = np.sum((predicted == 1) * (y_eval == 1)) / np.sum(y_eval == 1)
        balanced_accuracy = 0.5 * (A_0 + A_1)
        print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
              "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                               balanced_accuracy))

        # Second plot: the actual validation labels with the same boundary.
        util.plot(x_eval, y_eval, model.theta, plot_stem + "validation")

    naive_out = save_path.replace(WILDCARD, 'naive')
    upsampling_out = save_path.replace(WILDCARD, 'upsampling')

    # Part (b): vanilla logistic regression.
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    fit_and_report(x_train, y_train, x_val, y_val, naive_out)

    # Part (d): append (int(1 / kappa) - 1) extra copies of every training
    # row labelled 1; ``kappa`` is read from module scope.
    num_add = int(1 / kappa) - 1
    minority = y_train == 1
    extra_x = np.repeat(x_train[minority, :], num_add, axis=0)
    extra_y = np.repeat(y_train[minority], num_add, axis=0)
    x_upsampled = np.concatenate((x_train, extra_x), axis=0)
    y_upsampled = np.concatenate((y_train, extra_y), axis=0)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)
    fit_and_report(x_upsampled, y_upsampled, x_val, y_val, upsampling_out)
Пример #16
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    All three conditions are evaluated against the test set's true (t)
    labels. Plots are written to 'plot_5a.png', 'plot_5b.png' and
    'plot_5f.png'.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by 'true', 'naive' or 'adjusted'.

    Raises:
        ValueError: If the validation set has no example with y == 1, so
            alpha cannot be estimated.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)

    model_true = LogisticRegression()
    model_true.fit(x_train, t_train)

    # The test set is loaded once with its true labels and reused by every
    # part (assumes util.load_dataset returns the same features whatever the
    # label column -- TODO confirm against util).
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)

    util.plot(x_test, t_test, model_true.theta, 'plot_5a.png')
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)

    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)

    # Plot against the TRUE labels, as the condition requires.
    util.plot(x_test, t_test, model_naive.theta, 'plot_5b.png')
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on
    # true labels. Alpha is estimated from the observed y-labels: it is the
    # mean naive-model output over validation examples with y == 1.
    x_valid, y_valid = util.load_dataset(valid_path,
                                         label_col='y',
                                         add_intercept=True)

    labelled_positives = x_valid[y_valid == 1]
    if len(labelled_positives) == 0:
        raise ValueError(
            'validation set contains no example with y == 1; '
            'cannot estimate alpha')
    alpha = np.mean(model_naive.predict(labelled_positives))

    util.plot(x_test,
              t_test,
              model_naive.theta,
              'plot_5f.png',
              correction=alpha)

    # The naive model estimates p(y=1|x) = alpha * p(t=1|x), so the
    # corrected output is the naive output DIVIDED by alpha.
    np.savetxt(output_path_adjusted, model_naive.predict(x_test) / alpha)
0
    PX = data[:, 0:2]
    y = data[:, 2]
    
    # Standardize the data
    mean = PX.mean(axis=0)
    std = PX.std(axis=0)
    PX = (PX - mean) / std
    
    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
    y_min, y_max = PX[:, 1].min() - .5, PX[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    allPoints = np.c_[xx.ravel(), yy.ravel()]
    allPoints = mapFeature(allPoints[:,0], allPoints[:,1])
    Z = logregModel.predict(allPoints)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot the training points
    plt.scatter(PX[:, 0], PX[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)
    
    # Configure the plot display
    plt.xlabel('Microchip Test 1')
    plt.ylabel('Microchip Test 2')

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
Пример #18
0
def evaluatePerformance(numTrials = 1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    averaged over numTrials trials of 10-fold cross validation, and save a
    learning curve to 'learningcurve.png'.

    For every trial the data is reshuffled; for every fold both models are
    trained on the first 10%, 20%, ..., 90% of that fold's training rows and
    scored on the fold's complete test rows.

    Args:
      numTrials: number of shuffle-and-cross-validate repetitions.

    Return:
      a matrix giving the performance (over all folds, trials and training
      percentages) that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data: first column is the label, the rest are features.
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n = X.shape[0]

    # Index array reshuffled in place before every trial.
    idx = np.arange(n)
    np.random.seed(13)

    # number of folds
    k = 10
    # number of training-set percentages evaluated (10% .. 90%)
    num_steps = 9
    # Start/end row of each fold; the last fold absorbs the remainder.
    # '//' keeps the bounds integers on both Python 2 and Python 3.
    fold_size = n // k
    fold_bounds = [i * fold_size for i in range(k)] + [n]

    # One row per (trial, fold); one column per training-set percentage.
    log_learning = np.zeros((numTrials * k, num_steps))
    tree_learning = np.zeros((numTrials * k, num_steps))
    row = 0

    # making decision tree object and a logistic regression object
    clf = tree.DecisionTreeClassifier()
    lr = LogisticRegression(alpha = 0.0000001, regLambda=0.001, epsilon=0.0001, maxNumIters = 10000)

    # ~~~~~~~~~~~main loop ~~~~~~~~~~~~~~~~~
    for i in range(numTrials):
        # shuffle data before each cross validation
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        for j in range(k):
            # fold j is the test data; everything else is training data
            start, stop = fold_bounds[j], fold_bounds[j + 1]
            Xtest = X[start:stop, :]
            ytest = y[start:stop, :]
            Xtrain = np.append(X[0:start, :], X[stop:n, :], axis=0)
            ytrain = np.append(y[0:start, :], y[stop:n, :], axis=0)

            # size of one 10% block of the training data
            block = Xtrain.shape[0] // 10
            for l in range(1, num_steps + 1):
                # train on the first l blocks, always score on the WHOLE
                # test fold
                m = block * l
                clf = clf.fit(Xtrain[0:m, :], ytrain[0:m, :])
                lr.fit(Xtrain[0:m, :], ytrain[0:m, :])
                tree_learning[row, l - 1] = accuracy_score(ytest, clf.predict(Xtest))
                log_learning[row, l - 1] = accuracy_score(ytest, lr.predict(Xtest))
            # next (trial, fold) combination gets its own row
            row += 1

    # mean / standard deviation of the accuracy over every recorded run
    meanDecisionTreeAccuracy = np.mean(tree_learning)
    meanLogisticRegressionAccuracy = np.mean(log_learning)
    stddevDecisionTreeAccuracy = np.std(tree_learning)
    stddevLogisticRegressionAccuracy = np.std(log_learning)

    # make certain that the return value matches the API specification
    stats = np.zeros((2,2))
    stats[0,0] = meanDecisionTreeAccuracy
    stats[0,1] = stddevDecisionTreeAccuracy
    stats[1,0] = meanLogisticRegressionAccuracy
    stats[1,1] = stddevLogisticRegressionAccuracy

    # mean accuracy for each percentage block, for the learning curve
    plot_log = np.mean(log_learning, axis=0)
    plot_tree = np.mean(tree_learning, axis=0)
    percent_array = [10,20,30,40,50,60,70,80,90]

    plt.figure(1)
    plt.clf()
    plt.title("Learning Curve")
    plt.xlabel("Percentage")
    plt.ylabel("Accuracy")
    plt.axis([0,100, .6,.8])
    plt.plot(percent_array,plot_log, 'rx', label='Logistic Regression')
    plt.plot(percent_array,plot_tree, 'bx',label ='Decision Tree')
    plt.legend(loc='lower right')
    plt.savefig('learningcurve.png')

    return stats
Пример #19
0
    if args.test:
        test_file = args.test
        test = pd.read_csv(test_file)

    if test_file is None:
        print("Splitting train to accomodate for test set.")
        train, test = train_test_split(train, test_size=0.2)

    train_Y = train['labels'].values
    train_X = train.drop(['labels'], axis=1).values

    test_Y = test['labels'].values
    test_X = test.drop(['labels'], axis=1).values

    print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
    logreg = LogisticRegression(learning_rate=lr,
                                epochs=epochs,
                                initialiser=init,
                                verbose=verbose)
    logreg.fit(train_X, train_Y)
    predictions = logreg.predict(test_X)

    if args.output == ".":
        args.output = os.getcwd()
    with open(args.output + "/classification_report.txt", 'w') as f:
        f.write(str(classification_report(test_Y, predictions)))

    test['predictions'] = predictions
    test.to_csv(args.output + "/predictions.csv")
Пример #20
0
def _scatter_validation(x_valid, y_valid):
    """Scatter-plot validation examples, coloured by label, on a new figure.

    Columns 1 and 2 of ``x_valid`` are used as the horizontal and vertical
    coordinates, and the y-axis is clamped to the range of column 2.
    NOTE(review): column 0 is presumably the intercept term added by
    ``util.load_dataset(add_intercept=True)`` -- confirm against util.

    Args:
        x_valid: 2-D array of validation inputs with at least three columns.
        y_valid: Array of validation labels; cast to ``int`` for colouring.

    Returns:
        The ``(fig, ax)`` pair of the newly created figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    # Use the builtin ``int``: the ``np.int`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    return fig, ax


def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    For each condition a logistic regression model is fitted on the training
    set and the validation set is scatter-plotted. Parts (a) and (b) also
    draw the decision line and save the figure to ``posonly_all_observed.png``
    and ``naive_training_partial.png`` respectively.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='t')
    # Imported locally (once); the name is reused by parts (b) and (f).
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = _scatter_validation(x_valid, y_valid)
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    # NOTE(review): the validation labels loaded here are the 'y' column,
    # not the true 't' labels named in the heading above -- confirm intent.
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = _scatter_validation(x_valid, y_valid)
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    # NOTE(review): no correction factor is applied below; the model is
    # refitted on the same y-labelled training data as in part (b) and its
    # validation predictions are only printed.
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # decision: model predictions on the validation inputs
    y_pred = clf.predict(x_valid)
    print(y_pred)

    fig, ax = _scatter_validation(x_valid, y_valid)
    plt.show()