Example No. 1
def compare_penalty():
    """Compare train/test accuracy of logistic regression with and
    without L2 regularization over a range of penalty strengths 2^k."""

    penalties = (-8, -6, -4, -2, 0, 2)

    # Instantiate the Logistic Regression model with no regularization
    model_unreg = LogisticRegression(verbose=True)

    # Fit the model.
    model_unreg.fit(x_tr, y_tr, lr=10**(-4), maxit=ITERATIONS, tolerance=None)

    # Predict results
    pred_tr_unreg = model_unreg.predict(x_tr)
    pred_ts_unreg = model_unreg.predict(x_ts)

    # Evaluate
    train_scores = BinaryScorer(y_tr,
                                pred_tr_unreg,
                                description='Training',
                                positive_class=1,
                                negative_class=0)
    test_scores = BinaryScorer(y_ts,
                               pred_ts_unreg,
                               description='Testing',
                               positive_class=1,
                               negative_class=0)

    accuracy_tr_unreg = train_scores.accuracy()
    accuracy_ts_unreg = test_scores.accuracy()

    accuracies_tr_reg = []
    accuracies_ts_reg = []

    # Evaluate the model over a range of regularization strengths
    for k in penalties:

        # Instantiate the Logistic Regression model with L2 penalty 2^k
        model_reg = LogisticRegression(l2penalty=2**k, verbose=True)

        # Fit the model.
        model_reg.fit(x_tr,
                      y_tr,
                      lr=10**(-4),
                      maxit=ITERATIONS,
                      tolerance=None)

        # Predict results
        pred_tr_reg = model_reg.predict(x_tr)
        pred_ts_reg = model_reg.predict(x_ts)

        # Evaluate
        train_scores = BinaryScorer(y_tr,
                                    pred_tr_reg,
                                    description='Training',
                                    positive_class=1,
                                    negative_class=0)
        test_scores = BinaryScorer(y_ts,
                                   pred_ts_reg,
                                   description='Testing',
                                   positive_class=1,
                                   negative_class=0)

        accuracies_tr_reg.append(train_scores.accuracy())
        accuracies_ts_reg.append(test_scores.accuracy())

    fig = plt.figure()

    plt.plot(penalties,
             accuracies_tr_reg,
             'o-',
             label='Train w/ reg',
             color='blue')
    plt.plot(penalties,
             accuracies_ts_reg,
             'o-',
             label='Test w/ reg',
             color='purple')
    plt.plot([penalties[0], penalties[-1]],
             [accuracy_tr_unreg, accuracy_tr_unreg],
             '-',
             label='Train w/o reg',
             color='red')
    plt.plot([penalties[0], penalties[-1]],
             [accuracy_ts_unreg, accuracy_ts_unreg],
             '-',
             label='Test w/o reg',
             color='orange')

    plt.title('Regularization comparison, lr = 10^-4')
    plt.ylabel('Accuracy')
    plt.xlabel('k, where L2 penalty = 2^k')
    plt.grid(True, linestyle=':')
    plt.legend(loc="upper right")

    plt.show()
    fig.savefig('logistic_penalty_comp.png', dpi=300)
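
The function above relies on names defined elsewhere in the script: the splits x_tr, x_ts, y_tr, y_ts, the iteration cap ITERATIONS, and the custom LogisticRegression and BinaryScorer classes. A minimal setup sketch with synthetic data so the function can run end to end (all names and values below are assumptions for illustration, not part of the original script):

import numpy as np
import matplotlib.pyplot as plt

# Synthetic stand-in data; the shapes and the 80/20 split are arbitrary choices.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)

split = 160
x_tr, x_ts = X[:split], X[split:]
y_tr, y_ts = y[:split], y[split:]
ITERATIONS = 1000  # assumed iteration cap

compare_penalty()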
Example No. 2
    # body of a search over candidate `beta` values (outer loop not shown)
    tmp_rate = 0.
    for i, test_X in enumerate(k_X_set):
        test_y = k_y_set[i]
        # assemble the training set from all folds except fold i
        train_X = []
        train_y = []
        for j in range(len(k_X_set)):
            if j == i:
                continue
            train_X.append(k_X_set[j])
            train_y.append(k_y_set[j])
        train_X = np.vstack(train_X)
        train_y = np.hstack(train_y)
        lr = LogisticRegression(beta=beta)
        lr.fit(train_X, train_y)
        exp_y = lr.predict(test_X)
        # fold accuracy: 1 minus the error rate on the held-out fold
        tmp_rate += 1.0 - np.mean(exp_y != test_y)
    tmp_rate /= cv_k
    if tmp_rate > best_rate:
        best_initial_beta = beta
        best_rate = tmp_rate

print(best_initial_beta)
print(best_rate)

lr = LogisticRegression(beta=best_initial_beta)
X = []
y = []
for i in range(cv_k - 1):
    X.append(k_X_set[i])
    y.append(k_y_set[i])
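
The snippet begins inside a loop over candidate beta values and ends just before the final refit, so the fold lists k_X_set / k_y_set and the search scaffold live elsewhere. One plausible scaffold, assuming the full arrays are named X_all and y_all (hypothetical names) and an illustrative beta grid:

import numpy as np

cv_k = 5
perm = np.random.permutation(len(X_all))  # shuffle indices before splitting
folds = np.array_split(perm, cv_k)        # cv_k roughly equal folds
k_X_set = [X_all[f] for f in folds]
k_y_set = [y_all[f] for f in folds]

best_rate, best_initial_beta = 0.0, None
for beta in (0.01, 0.1, 1.0, 10.0):       # candidate grid (illustrative)
    ...                                   # cross-validation body shown above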
Example No. 3
"""
LogisticRegression1 test runner

Chapter "Logistic Regression"
"""

# imports
import numpy as np
from lr import LogisticRegression

# load data
data = np.genfromtxt('iris2.tsv', dtype=[('X', float, 4), ('y', int)])

# learn model
clr = LogisticRegression()
clr.fit(data['X'], data['y'])

# test model
predict_y = clr.predict(data['X'])

# spot-check: true vs. predicted label for every 10th sample
for i in range(0, 100, 10):
    print(i, data['y'][i], predict_y[i])

# print accuracy
print("Accuracy =", np.mean(data['y'] == predict_y))

# print parameters
print("coef =", clr.coef_)
print("intercept_ =", clr.intercept_)
Example No. 4
def main():

    # load training data
    print("####################################################")
    print("Data preprocessing...")
    filename = 'data/train.csv'
    Dict = transform(filename)

    # train_test_split now lives in sklearn.model_selection (the old
    # cross_validation module was removed in scikit-learn 0.20)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        Dict['data'],
        Dict['target'],
        test_size=test_size,
        random_state=random_state)

    train = pd.concat([X_train, y_train], axis=1)
    print("Cross validate on 10% of the whole dataset, training data shape: ",
          train.shape)
    thresh = nan_thresh * train.shape[1]
    train = train.dropna(thresh=thresh)
    #train = train.dropna(subset = ['Income','HouseholdStatus','EducationLevel'])

    train = fill_missing(train, strategy, isClassified)

    y = train['Happy']
    df = train.drop('Happy', axis=1)
    numeric_col = df[['YOB', 'votes']]
    #numeric_col = fill_missing(numeric_col,'mean',False)
    normalized = pd.DataFrame(normalize(numeric_col))
    categorical_col = df[[
        'Income', 'HouseholdStatus', 'EducationLevel', 'Party'
    ]]
    #categorical_col = fill_missing(categorical_col,'median',False)
    binary_col = df.drop(['UserID', 'YOB', 'Income', 'HouseholdStatus',
                          'EducationLevel', 'Party', 'votes'], axis=1)

    encoder = OneHotEncoder()
    #one_hot = pd.get_dummies(categorical_col)
    one_hot = pd.DataFrame(encoder.fit_transform(categorical_col).toarray())

    #X = pd.concat([normalized,one_hot,binary_col],axis = 1)

    X = np.hstack((normalized, one_hot, binary_col))
    #X = np.hstack((one_hot,binary_col))

    X_test = fill_missing(X_test, strategy, isClassified)
    UserID = X_test['UserID']

    categorical_col = X_test[[
        'Income', 'HouseholdStatus', 'EducationLevel', 'Party'
    ]]
    numeric_col = X_test[['YOB', 'votes']]
    binary_col = X_test.drop(['UserID', 'YOB', 'Income', 'HouseholdStatus',
                              'EducationLevel', 'Party', 'votes'], axis=1)
    # reuse the encoder fitted on the training categories; calling
    # fit_transform here could silently change the column layout
    one_hot = encoder.transform(categorical_col).toarray()
    normalized = normalize(numeric_col)
    #X = np.hstack((one_hot,binary_col))

    X_test = np.hstack((normalized, one_hot, binary_col))
    X_a = X
    X_b = X_test
    y_a = y
    y_b = y_test
    print("Training data shape (After drop NaN and one hot encoding): ",
          X_a.shape)
    print("Cross validation data shape (after one hot encoding): ", X_b.shape)
    #X_a, X_b, y_a, y_b = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)
    print("####################################################")
    ### use the logistic regression
    print('Train the logistic regression classifier...')
    start = time.time()
    clf_lr = LogisticRegression()
    clf_lr = clf_lr.fit(X_a, y_a)
    clf_lr_predict = clf_lr.predict(X_b)
    end = time.time()
    print("Accuracy of logistic regression is ",
          (y_b == clf_lr_predict).sum() / y_b.shape[0])
    print("Time of of logistic regression is %.4fs." % (end - start))

    start = time.time()
    lr_model = linear_model.LogisticRegression()
    lr_model = lr_model.fit(X_a, y_a)
    lr_model_predict = lr_model.predict(X_b)
    print("Scikit learn n_iter_: ", lr_model.n_iter_)
    end = time.time()
    print("Accuracy of scikit-learn logistic Regression is ",
          (y_b == lr_model_predict).sum() / y_b.shape[0])
    print("Time of of scikit-learn logistic regression is %.4fs." %
          (end - start))
    print(" ")

    ### use the naive Bayes
    print('Train the naive Bayes classifier...')
    start = time.time()
    clf_nb = NaiveBayes()
    clf_nb = clf_nb.fit(X_a, y_a)
    clf_nb_predict = clf_nb.predict(X_b)
    end = time.time()
    print("Accuracy of naive bayes  is ",
          (y_b == clf_nb_predict).sum() / y_b.shape[0])
    print("Time of of naive bayes is %.4fs." % (end - start))

    start = time.time()
    nb_model = BernoulliNB()
    nb_model = nb_model.fit(X_a, y_a)
    nb_model_predict = nb_model.predict(X_b)
    end = time.time()
    print("Accuracy of scikit-learn Bernoulli NB is ",
          (y_b == nb_model_predict).sum() / y_b.shape[0])
    print("Time of of scikit-learn Bernoulli NB is %.4fs." % (end - start))
    print(" ")

    ## use the svm
    print('Train the SVM classifier...')
    start = time.time()
    svm_model = SVC(kernel=kernel)
    svm_model = svm_model.fit(X_a, y_a)
    svm_model_predict = svm_model.predict(X_b)
    end = time.time()

    print("Accuracy of scikit-learn SVM is ",
          (y_b == svm_model_predict).sum() / y_b.shape[0])
    print("Time of of scikit-learn SVM is %.4fs." % (end - start))
    print(" ")

    ## use the random forest
    print('Train the random forest classifier...')
    start = time.time()
    rf_model = RandomForestClassifier(n_estimators=n_estimators,
                                      random_state=random_state)
    rf_model = rf_model.fit(X_a, y_a)
    rf_model_predict = rf_model.predict(X_b)
    end = time.time()

    print("Accuracy of scikit-learn RF is ",
          (y_b == rf_model_predict).sum() / y_b.shape[0])
    print("Time of of scikit-learn RF is %.4fs." % (end - start))
    print(" ")

    print("####################################################")
    print("Start predicting test dataset...")
    filename_test = './data/test.csv'
    Dict = transform(filename_test)
    X_test_data = Dict['data']

    test = fill_missing(X_test_data, strategy, isClassified)
    UserID = test['UserID'].astype(int)

    categorical_col = test[[
        'Income', 'HouseholdStatus', 'EducationLevel', 'Party'
    ]]
    numeric_col = test[['YOB', 'votes']]
    binary_col = test.drop(['UserID', 'YOB', 'Income', 'HouseholdStatus',
                            'EducationLevel', 'Party', 'votes'], axis=1)
    one_hot = encoder.transform(categorical_col).toarray()  # reuse fitted encoder
    normalized = normalize(numeric_col)
    #X_test = np.hstack((one_hot,binary_col))

    X_test = np.hstack((normalized, one_hot, binary_col))
    print("Test data shape (after one hot encoding): ", X_test.shape)

    # clf_lr predictions
    lr_pred = clf_lr.predict(X_test)
    lr_Happy = pd.DataFrame({'Happy': lr_pred})
    predsLR = pd.concat([UserID, lr_Happy], axis=1)
    predsLR.to_csv('./predictions/lr_predictions.csv', index=False)

    # clf_nb predictions
    nb_pred = clf_nb.predict(X_test)
    nb_Happy = pd.DataFrame({'Happy': nb_pred})
    predsNB = pd.concat([UserID, nb_Happy], axis=1)
    predsNB.to_csv('./predictions/nb_predictions.csv', index=False)

    # rf predictions
    rf_pred = rf_model.predict(X_test)
    rf_Happy = pd.DataFrame({'Happy': rf_pred})
    predsRF = pd.concat([UserID, rf_Happy], axis=1)
    predsRF.to_csv('./predictions/rf_predictions.csv', index=False)

    # svm predictions
    svm_pred = svm_model.predict(X_test)
    svm_Happy = pd.DataFrame({'Happy': svm_pred})
    predsSVM = pd.concat([UserID, svm_Happy], axis=1)
    predsSVM.to_csv('./predictions/svm_predictions.csv', index=False)
    print("Prediction saved.")