def main():
    """Train logistic regression on the binary iris subset and plot results."""
    # Keep only classes 1 and 2 of iris, relabelled to {0, 1}.
    iris = datasets.load_iris()
    mask = iris.target != 0
    features = normalize(iris.data[mask])
    labels = iris.target[mask]
    labels[labels == 1] = 0
    labels[labels == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, seed=1)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    predictions = np.reshape(model.predict(X_test), y_test.shape)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, predictions,
                      title="Logistic Regression",
                      accuracy=accuracy,
                      legend_labels=iris.target_names)
def digit_recognition():
    """Digit recognition with PCA-reduced features and LogisticRegression.

    BUG FIX: predictions were previously generated for the FULL dataset
    (train + test) while the banner claimed "test predictions", so the
    reported accuracy leaked training data. The held-out test split is
    now used.
    """
    print('\nDigit recognition using Logistic Regression\n')
    print('Initiating Data Load...')
    digits = datasets.load_digits()
    X, y = digits.data, digits.target
    # Project to 23 principal components; one-hot targets for the classifier.
    pca = PCA()
    X = pca.transform(X, num_components=23)
    y = one_hot_encode(y)
    # Shuffle before the 80/20 split.
    size = len(X)
    indices = list(range(size))
    np.random.shuffle(indices)
    X, y = np.array([X[idx] for idx in indices]), np.array(
        [y[idx] for idx in indices])
    train_size = int(0.8 * len(X))
    X_train, X_test, y_train, y_test = X[:train_size], X[
        train_size:], y[:train_size], y[train_size:]
    print('Constructing classifier...')
    # (input_dim, output_dim) pair expected by this project's constructor.
    size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(size)
    classifier.fit(X_train, y_train)
    print('Generating test predictions...')
    predictions = classifier.predict(X_test)
    # A prediction counts as correct only if every one-hot entry matches.
    accuracy = np.sum(
        [all(y_true == y_pred) for y_true, y_pred in zip(y_test, predictions)]
    ) / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
def main():
    """Compare sklearn's LogisticRegression with our implementation on the
    binary iris subset.

    BUG FIX: clf1 was assigned linear_model.LogisticRegression() and then
    immediately overwritten with our LogisticRegression(), so both printed
    accuracies came from the same (our) model. The overwrite is removed so
    the "sklearn lr" line is real.
    """
    # Keep classes 1 and 2 only, relabelled to {0, 1}.
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    clf1 = linear_model.LogisticRegression()
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)
    accuracy = accuracy_score(y_test, y_pred)
    print("sklearn lr Accuracy:", accuracy)

    clf2 = LogisticRegression()
    clf2.fit(X_train, y_train)
    y_pred = clf2.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)
    accuracy = accuracy_score(y_test, y_pred)
    print("Our lr Accuracy:", accuracy)
def iris_classification():
    """Binary iris classification (classes 0 vs 1) with LogisticRegression.

    BUG FIX: predictions were previously generated for the FULL dataset
    X/y (train + test) despite the "test predictions" banner, leaking
    training data into the reported accuracy; the held-out split is now
    used.
    """
    print('\nIris classification using Logistic Regression\n')
    print('Initiating Data Load...')
    iris = datasets.load_iris()
    # X, y = iris.data, iris.target
    # y = one_hot_encode(y)
    X, y = iris.data[iris.target != 2], iris.target[iris.target != 2]
    y = y.reshape(y.shape[0], 1)
    # Shuffle before the 80/20 split.
    size = len(X)
    indices = list(range(size))
    np.random.shuffle(indices)
    X, y = np.array([X[idx] for idx in indices]), np.array(
        [y[idx] for idx in indices])
    train_size = int(0.8 * len(X))
    X_train, X_test, y_train, y_test = X[:train_size], X[
        train_size:], y[:train_size], y[train_size:]
    print('Data load complete!')
    print('Constructing classifier...')
    # (input_dim, output_dim) pair expected by this project's constructor.
    size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(size)
    classifier.fit(X_train, y_train)
    print('Generating test predictions...')
    predictions = classifier.predict(X_test)
    # A prediction is correct only if all output entries match.
    accuracy = np.sum(
        [all(y_true == y_pred) for y_true, y_pred in zip(y_test, predictions)]
    ) / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
def test_logistic_regression():
    """Smoke test: LR learns a linearly separable threshold on one feature."""
    features = np.random.normal(size=(100, 2))
    targets = np.where(features[:, 0] > 0.5, 1, 0).reshape(-1, 1)
    model = LogisticRegression()
    model.fit(features, targets)
    # Threshold the probability for the last sample and compare to its label.
    last_prob = model.predict(features)[-1]
    label = 1 if last_prob > 0.5 else 0
    assert pytest.approx(label) == targets[-1]
def main():
    """Train, validate, plot, and run test-set predictions with the project's
    LogisticRegression.

    Fix: test/output.csv was written with the pandas index as an extra
    unnamed column; index=False keeps only the prediction column.
    """
    # Get training matrices for logistic regression model
    x, y = get_train_matrices()
    # Create instance of LogisticRegression with the training matrices
    logistic_regression = LogisticRegression(x, y)
    # Fit with learning rate, no of iterations and regularization(L2) parameter
    logistic_regression.fit(0.01, 100000, 0)
    # Print weights and biases learned by the model
    print("So, the weights and biases become:\nWeights:\n {}\nBiases:\n {}"
          .format(logistic_regression.w, logistic_regression.c))
    # Validate the model by printing the performance metrics
    logistic_regression.validate()
    # Graph the curve of cost vs no of epochs
    logistic_regression.graph_cost_vs_epochs()
    # Predict for the input data in test folder and save as output.csv;
    # column 0 of input.csv is assumed to be an id column and is dropped.
    x_test = pd.read_csv('test/input.csv').values[:, 1:]
    y_test = logistic_regression.predict(x_test)
    df_predict = pd.DataFrame({'y': y_test.reshape(-1)})
    df_predict.to_csv('test/output.csv', index=False)
def test_integ_fit():
    """Integration test: fitted parameters match known-good reference values."""
    cases = [
        (np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]]),
         np.array([1, 0, 1]),
         np.array([0.01328192, 0.06222676, 0.1111716])),
    ]
    lr_model = LogisticRegression()
    for X, y, expected in cases:
        lr_model.fit(X, y)
        assert pytest.approx(expected, 1e-06) == lr_model.parameters
def train():
    """CLI entry point: argv[1] = training CSV, argv[2] = model output path."""
    input_path, output_path = sys.argv[1], sys.argv[2]
    X, y = get_data(data=read_train_csv(input_path))
    classifier = LogisticRegression(iteration=30000)
    classifier.fit(X, y)
    classifier.save(output_path)
def runML(meth, itrs, data_train, data_test, labels_train, labels_test):
    # Train a LogisticRegression with the given optimisation method and
    # iteration budget, then report per-tag accuracy statistics on the test
    # split. Timestamps bracket the fit and predict phases as a crude
    # wall-clock measure. NOTE: Python 2 print statements.
    print meth,datetime.now().time()
    model = LogisticRegression(method=meth,max_iters=itrs)
    model.fit(data_train, labels_train)
    print datetime.now().time()
    prediction = model.predict(data_test)
    # tagAccuracy is called on the class itself (static/class-level helper).
    tagscores = LogisticRegression.tagAccuracy(labels_test, prediction)
    score = np.mean(tagscores)
    print " score tags: mean: {}, max: {}, min: {}".format(score,max(tagscores),min(tagscores))
    print " error rate: {}".format(1 - score)
    print datetime.now().time()
def standard_lr(x_train, y_train, x_valid, y_valid):
    # Baseline using sklearn's LogisticRegression (L2 penalty, SAG solver,
    # one-vs-rest) and a manual accuracy count on the validation split.
    # NOTE: Python 2 print statement at the end.
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(penalty='l2', max_iter=500, solver='sag',
                            multi_class='ovr')
    lr.fit(x_train, y_train)
    pre = lr.predict(x_valid)
    correct = 0
    for i in range(len(y_valid)):
        if pre[i] == y_valid[i]:
            correct += 1
    # *1.0 forces float division under Python 2.
    print correct*1.0/len(y_valid)
def lambdaError(lam, folds):
    """Mean 5-fold cross-validation classification error for strength lam."""
    logreg = LogisticRegression(lam)
    total_error = 0
    for fold_idx in range(5):
        # Hold out fold fold_idx, train on the rest.
        held_out, training = utils.partition_cross_validation_fold(
            folds, fold_idx)
        logreg.fit(training[0], training[1])
        predictions = logreg.predict(held_out[0])
        total_error += utils.classification_error(predictions, held_out[1])
    return total_error / 5
def repeat(train_X, train_Y, num, feature_list, intercept):
    """Fit three per-column logistic regressions and report info for one.

    Fix: the original filled NaNs with a quadratic per-cell loop. Filling a
    NaN with the column mean does not change that mean, so a single
    vectorised fillna per column is equivalent and far faster. train_X is
    still mutated in place, as before.
    """
    for col in train_X.columns:
        train_X[col] = train_X[col].fillna(train_X[col].mean())
    # One model per target column (train_Y has 3 columns).
    models = []
    for i in range(3):
        lr = LogisticRegression(fit_intercept=intercept)
        lr.fit(train_X[feature_list], train_Y.values[:, i])
        models.append(lr)
    # Return diagnostics for the requested target column only.
    model = models[num]
    return modelInfo(model, train_X, train_Y.values[:, num], feature_list,
                     intercept)
def p02cde(train_path, valid_path, test_path, pred_path):
    """Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for evaluation.
        test_path: Path to CSV file containing dataset for testing.
        pred_path: Path to save predictions (contains a WILDCARD slot).

    BUG FIX (part e): the original iterated ``for i in len(x_train)``
    (TypeError: int is not iterable) and tested the whole array
    ``y_train == 1`` instead of the i-th label; both are corrected.
    """
    pred_path_c = pred_path.replace(WILDCARD, "c")
    pred_path_d = pred_path.replace(WILDCARD, "d")
    pred_path_e = pred_path.replace(WILDCARD, "e")

    # Part (c): train on the true labels t.
    x_train, y_train = utils.load_dataset(train_path, label_col="t",
                                          add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    # Validate classifier
    x_test, y_test = utils.load_dataset(valid_path, label_col="t",
                                        add_intercept=True)
    t_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_c))
    np.savetxt(pred_path_c, t_pred)

    # Part (d): train on the observed (noisy) labels y.
    x_train, y_train = utils.load_dataset(test_path, label_col="y",
                                          add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    # Validate classifier
    x_test, y_test = utils.load_dataset(test_path, label_col="t",
                                        add_intercept=True)
    y_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_d))
    np.savetxt(pred_path_d, y_pred)

    # Part (e): estimate the correction factor alpha = E[h(x) | y = 1]
    # over the positively-labelled training examples.
    x_val, y_val = utils.load_dataset(valid_path, label_col="y",
                                      add_intercept=True)
    x_in_V = [x_train[i] for i in range(len(x_train)) if y_train[i] == 1]
    h = model.predict(x_in_V)
    alpha = np.mean(h)
def test_fit_functional():
    """Functional test: the model separates two Gaussian blobs.

    Cleanup: the original ended with three stacked accuracy assertions
    (>= 0.65, >= 0.90, >= 0.99); since they run sequentially, only the
    strictest bound can be the deciding one, so they are collapsed into a
    single assertion.
    """
    import sklearn.model_selection
    import sklearn.datasets
    import numpy as np
    from logistic_regression import LogisticRegression, accuracy

    # Two 2-D blobs plus a constant column of ones (bias trick).
    X = np.zeros((1000, 3), dtype=np.float32)
    X[:, -1] = 1
    features, targets = sklearn.datasets.make_blobs(1000, 2, 2, cluster_std=1,
                                                    random_state=1234)
    X[:, [0, 1]] = features
    y = targets[:, np.newaxis]

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y)
    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(X_train, y_train, X_val, y_val,
                                     num_epochs=20, batch_size=4, alpha=0.1,
                                     _lambda=0.0)
    predictions = model.predict(X_val)
    assert accuracy(predictions, y_val) >= 0.99
def test_fit_functional():
    """Functional test on a planar grid target.

    Cleanup: the original ended with three stacked accuracy assertions
    (>= 0.65, >= 0.90, >= 0.99); only the strictest bound matters, so they
    are collapsed into one. NOTE(review): y here is a continuous plane
    value (0.1*x + 0.2*y + 0.4), not 0/1 labels — presumably accuracy()
    or predict() thresholds internally; confirm against the project code.
    """
    import sklearn.model_selection
    import numpy as np
    from logistic_regression import LogisticRegression, accuracy

    # 30x30 grid over [-5, 5]^2 with a bias column of ones.
    X = np.zeros((900, 3), dtype=np.float32)
    num_samples = 30
    xx = np.linspace(-5, 5, num_samples)
    XX, YY = np.meshgrid(xx, xx)
    X[:, 0] = XX.flatten()
    X[:, 1] = YY.flatten()
    X[:, -1] = 1  # a column of 1's for the bias trick
    Z = 0.1 * XX + 0.2 * YY + 0.4
    y = Z.reshape(-1, 1)

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y)
    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(X_train, y_train, X_val, y_val,
                                     num_epochs=20, batch_size=4, alpha=0.1,
                                     _lambda=0.0)
    predictions = model.predict(X_val)
    assert accuracy(predictions, y_val) >= 0.99
def p01b(train_path, eval_path, pred_path):
    """Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Fit on the training split.
    features, targets = utils.load_dataset(train_path, add_intercept=True)
    clf = LogisticRegression(eps=1e-5)
    clf.fit(features, targets)

    # Evaluate on the validation split: plot the boundary, save predictions.
    x_val, y_val = utils.load_dataset(eval_path, add_intercept=True)
    predictions = clf.predict(x_val)
    utils.plot(x_val, y_val, clf.theta, "{}.png".format(pred_path))
    np.savetxt(pred_path, predictions)
def fitting():
    """Fit logistic regression on the student-score dataset, print metrics,
    and plot the loss curve plus the decision boundary.

    Fix: typos in user-facing strings ("euqals" -> "equals",
    "Accurancy" -> "Accuracy").
    """
    data = pd.read_csv('student_score.txt',
                       names=['Exam1', 'Exam2', 'admission'])
    x = data[['Exam1', 'Exam2']]
    y = data['admission']
    print(x.mean())
    print(x.max() - x.min())
    # Mean-normalise each feature by its range.
    x = (x - x.mean()) / (x.max() - x.min())
    alpha = 10
    max_iter = 150
    model = LogisticRegression(alpha, max_iter)
    loss, _ = model.fit(x, y)
    # Single prediction for Exam1=45, Exam2=85 using the hard-coded
    # dataset mean/range normalisation constants.
    p = model.predict(
        np.array([[
            1, (45.0 - 65.644274) / 69.769035, (85.0 - 66.221998) / 68.266173
        ]]), False)
    print('Predict %.3f when Exam1 equals 45 and Exam2 equals 85' % p)
    plt.subplot(2, 1, 1)
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.title('Loss Curve')
    plt.subplot(2, 1, 2)
    negative = data[data['admission'] == 0]
    positive = data[data['admission'] == 1]
    plt.plot(negative['Exam1'], negative['Exam2'], 'yo')
    plt.plot(positive['Exam1'], positive['Exam2'], 'k+')
    print(model.w)
    # Decision boundary mapped back to the original (unnormalised) axes.
    bx = data['Exam1']
    by = (-68.266173 / model.w[2]) * ((
        (bx - 65.644274) / 69.769035) * model.w[1] + model.w[0]) + 66.221998
    x = data[['Exam1', 'Exam2']]
    x = (x - x.mean()) / (x.max() - x.min())
    p = [1 if i >= 0.5 else 0 for i in model.predict(x)]
    # Confusion-matrix counts.
    tp = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 1])
    tn = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 0])
    fp = sum([1.0 for vp, vy in zip(p, y) if vp == 1 and vy == 0])
    fn = sum([1.0 for vp, vy in zip(p, y) if vp == 0 and vy == 1])
    print(tp, tn, fp, fn)
    print('Accuracy %.2f' % ((tp + tn) / (tp + tn + fp + fn)))
    print('Precision %.2f' % (tp / (tp + fp)))
    print('Recall %.2f' % (tp / (tp + fn)))
    plt.plot(bx, by)
    plt.show()
def test_logistic_regression():
    """Fit LR on petal features of iris classes 0/1 and show decision regions."""
    iris = datasets.load_iris()
    features = iris.data[:, [2, 3]]
    labels = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=1, stratify=labels)

    # Restrict training data to the two linearly separable classes.
    binary_mask = (y_train == 0) | (y_train == 1)
    X_binary = X_train[binary_mask]
    y_binary = y_train[binary_mask]

    model = LogisticRegression(alpha=0.05, n_iter=1000, random_state=1)
    model.fit(X_binary, y_binary)
    model.plot_decision_regions(X=X_binary, y=y_binary, classifier=model)
    plt.xlabel('Petal Length')
    plt.ylabel('Petal Width')
    plt.legend(loc='upper left')
    plt.show()
def test_passing():
    """Torch-based LR reaches ~96.7% accuracy on a fixed synthetic problem."""
    X, Y = datasets.make_classification(n_samples=100, random_state=42)
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=0.3, random_state=42)

    model = LogisticRegression(20, seed=42)
    model.fit(X_train, Y_train)

    # Threshold probabilities to hard 0/1 labels as float32 tensors.
    raw_scores = model.predict(X_test)
    hard_preds = (raw_scores > 0.5).clone().detach().type(torch.float32)
    truth = torch.reshape(
        torch.tensor(Y_test, dtype=torch.float32), (-1, 1))

    # Mean absolute difference of 0/1 tensors equals the error rate.
    accuracy = 1 - torch.mean(torch.abs(hard_preds - truth)).item()
    assert abs(accuracy - 0.9666) < 0.01
def cross_validate_logistic(X, y, alpha, num_iterations):
    """Mean accuracy over 5 sequential (non-shuffled) cross-validation folds."""
    fold_size = math.ceil(len(y) / 5)
    # Zero-initialised parameter vector, one entry per feature column.
    initial_theta = [0 for _ in range(len(X.columns))]

    total_accuracy = 0
    for fold in range(5):
        lo, hi = fold_size * fold, fold_size * (fold + 1)
        # Fold [lo:hi] is held out; everything else trains.
        x_valid = np.array(X.iloc[lo:hi])
        y_valid = np.array(y.iloc[lo:hi])
        x_train = np.array(X.drop(X.index[lo:hi]))
        y_train = np.array(y.drop(y.index[lo:hi]))

        LR = LogisticRegression(initial_theta)
        LR.fit(x_train, y_train, alpha, num_iterations)
        total_accuracy += evaluate_acc(y_valid, LR.predict(x_valid))

    return total_accuracy / 5
def sgd(mus, rates, decays, data, labels, data_train, labels_train,
        data_valid, labels_valid, data_test, labels_test):
    # Grid-search SGD hyperparameters (regularisation mu, learning rate,
    # decay) on the validation split, then retrain the best setting on the
    # full dataset and score it on the test split. Returns one DataFrame
    # row per configuration plus one row for the final test run.
    # NOTE: Python 2 print statements.
    print "starting grid search for SGD"
    validation_results = {}
    dicts = []  # one record dict per configuration for the result DataFrame
    for mu in mus:
        for rate in rates:
            for decay in decays:
                print "trying mu={} rate={} decay={}".format(mu, rate, decay)
                model = LogisticRegression(method="sgd", mu=mu, rate=rate,
                                           decay=decay, random_state=0)
                model.fit(data_train, labels_train)
                prediction = model.predict(data_valid)
                score = accuracy_score(labels_valid, prediction)
                validation_results[(mu, rate, decay)] = score
                print " score: {}".format(score)
                print " error rate: {}".format(1 - score)
                # lcl_/rlcl_ are the model's (regularised) log conditional
                # likelihood diagnostics after fitting.
                d = dict(method="sgd", mu=mu, rate=rate, decay=decay,
                         score=score, lcl=model.lcl_, rlcl=model.rlcl_,
                         test=False)
                dicts.append(d)
    print "evaluating on test set"
    # get hyperparameters for highest accuracy on validation set
    mu, rate, decay = max(validation_results, key=validation_results.get)
    print "Using mu={} rate={} decay={}".format(mu, rate, decay)
    # train on entire train set and predict on test set
    model = LogisticRegression(method="sgd", mu=mu, rate=rate, decay=decay,
                               random_state=0)
    model.fit(data, labels)
    prediction = model.predict(data_test)
    score = accuracy_score(labels_test, prediction)
    print "SGD test score: {}, error rate: {}".format(score, 1 - score)
    d = dict(method="sgd", mu=mu, rate=rate, decay=decay, score=score,
             lcl=model.lcl_, rlcl=model.rlcl_, test=True)
    dicts.append(d)
    return pd.DataFrame(dicts)
def test_fit(self):
    # Fit a 2-feature model for one epoch using a stubbed update rule
    # (generate_dummy_update) and collect the weight history yielded by
    # fit(): expect three snapshots where the first and last coincide and
    # the middle one differs (the dummy update perturbs then restores).
    model = LogisticRegression(2, epochs=1,
                               update_method=generate_dummy_update(2))
    # [None].T turns the 1-D input into a column vector.
    ws = [
        list(w) for w in model.fit(
            np.array([1, 2, 3])[None].T, np.array([1, 0, 1]))
    ]
    self.assertEqual(len(ws), 3)
    self.assertListEqual(ws[0], ws[2])
    self.assertFalse(list(ws[0]) == list(ws[1]))
def logistic_model(X, y, learning_rate, no_of_iterations,
                   test_split_ratio=0.2):
    """Train and evaluate LogisticRegression on a train/test split.

    BUG FIX: the learning_rate and no_of_iterations parameters were
    ignored — the model was hard-coded to learning_rate=0.0001 and
    n_iters=1000. The parameters are now passed through.

    Returns:
        (sensitivity, specificity, accuracy) from evaluate(), with accuracy
        recomputed by accuracy() as in the original.
    """
    # bc = datasets.load_breast_cancer()
    # X, y = bc.data, bc.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split_ratio, random_state=1234)
    regressor = LogisticRegression(learning_rate=learning_rate,
                                   n_iters=no_of_iterations)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    senst, speci, acc = evaluate(y_test, predictions)
    # NOTE: acc from evaluate() is overwritten by accuracy(), as before.
    acc = accuracy(y_test, predictions)
    print("LR classification accuracy:", acc)
    return senst, speci, acc
def validate_logistic_regression_for_wine_quality():
    # 5-fold cross-validation of the project's LogisticRegression on the
    # red-wine-quality dataset; wines with quality >= 6 are labelled 1.
    num_of_folds = 5
    learning_rate = 0.000001
    max_iterations = 100
    df = pd.read_csv("../data/winequality/winequality-red.csv", sep=";")
    df['classified'] = [1 if x >= 6 else 0 for x in df["quality"]]
    fold_size = int(round(df.shape[0] / num_of_folds))
    for i in range(num_of_folds):
        # Fold i is the test slice; the rows before and after it train.
        x_test = df[[
            'fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
        ]][i * fold_size:fold_size + i * fold_size]
        x_train_part_1 = df[[
            'fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
        ]][fold_size + i * fold_size:]
        x_train_part_2 = df[[
            'fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
        ]][:i * fold_size]
        # NOTE(review): DataFrame.append is removed in pandas >= 2.0
        # (use pd.concat); left unchanged here.
        x_train = x_train_part_1.append(x_train_part_2)
        print(x_train)
        print(x_test)
        y_test = df[['classified']][i * fold_size:fold_size + i * fold_size]
        y_train_part_1 = df[['classified']][fold_size + i * fold_size:]
        y_train_part_2 = df[['classified']][:i * fold_size]
        y_train = y_train_part_1.append(y_train_part_2)
        print(y_train)
        print(y_test)
        # This project's fit takes (learning_rate, max_iterations, X, y).
        model = LogisticRegression()
        model.fit(learning_rate, max_iterations, np.array(x_train),
                  np.array(y_train))
        y_pred = model.predict(np.array(x_test))
        print(y_pred)
        print("score", model.score(np.array(x_test), np.array(y_test)))
    pass
def lbfgs(mus, data, labels, data_train, labels_train, data_valid,
          labels_valid, data_test, labels_test):
    # Grid-search the regularisation strength mu for the L-BFGS solver on
    # the validation split, then retrain the best mu on the full dataset
    # and score it on the test split. Returns one DataFrame row per mu plus
    # one row for the final test run (rate/decay are -1 placeholders since
    # L-BFGS has no learning-rate schedule here).
    # NOTE: Python 2 print statements.
    print "starting grid search for L-BFGS"
    validation_results = {}
    dicts = []
    for mu in mus:
        print "trying mu={}".format(mu)
        model = LogisticRegression(method="lbfgs", mu=mu)
        model.fit(data_train, labels_train)
        prediction = model.predict(data_valid)
        score = accuracy_score(labels_valid, prediction)
        validation_results[mu] = score
        print " score: {}".format(score)
        print " error rate: {}".format(1 - score)
        d = dict(method="lbfgs", mu=mu, rate=-1, decay=-1, score=score,
                 lcl=model.lcl_, rlcl=model.rlcl_, test=False)
        dicts.append(d)
    print "evaluating on test set"
    # get hyperparameters for highest accuracy on validation set
    mu = max(validation_results, key=validation_results.get)
    print "Using mu of {}".format(mu)
    # train on entire train set and predict on test set
    model = LogisticRegression(method="lbfgs", mu=mu)
    model.fit(data, labels)
    prediction = model.predict(data_test)
    score = accuracy_score(labels_test, prediction)
    print "L-BFGS test score: {}, error rate: {}".format(score, 1 - score)
    d = dict(method="lbfgs", mu=mu, rate=-1, decay=-1, score=score,
             lcl=model.lcl_, rlcl=model.rlcl_, test=True)
    dicts.append(d)
    return pd.DataFrame(dicts)
def fit(self, X, y):
    # AdaBoost-style fit: train self.n_estimators weak LogisticRegression
    # classifiers, reweighting the per-sample weights after each round.
    X = np.asarray(X)
    y = np.asarray(y)
    # Convert y to {-1, 1}
    y = self._convert_y(y)
    # Initialise weights for all data points
    row_length = X.shape[0]
    self.weights = np.ones((self.n_estimators, row_length))
    self.alphas = np.zeros((self.n_estimators, 1))
    self.estimators = np.empty((self.n_estimators, 1), dtype=object)
    time_step = 0
    for time_step in range(self.n_estimators):
        # Use a weak classifier to fit on data
        weak_classifier = LogisticRegression(solver="sgd", epochs=5)
        # NOTE(review): samples are weighted by scaling the feature rows
        # rather than passing sample weights to fit — confirm intended.
        X_weighted = self.weights[time_step].reshape(-1, 1) * X
        weak_classifier.fit(X_weighted, y)
        pred = weak_classifier.predict(X)
        # Get weighted error
        weighted_sample_err = (np.sum(
            (pred != y) * self.weights)) / np.sum(self.weights)
        # Alpha for current classifer; the 1e-16 term guards against a
        # zero-error division/log blow-up.
        alpha_t = 1 / 2 * np.log((
            (1 - weighted_sample_err) / weighted_sample_err) + 1e-16)
        self.alphas[time_step] = alpha_t
        self.estimators[time_step] = weak_classifier
        # Update weights of next time step for all data points
        if time_step == (self.n_estimators - 1):
            break
        self.weights[time_step + 1, :] = self.weights[time_step, :] * np.exp(
            -y * alpha_t * pred)
def test_logistic_regression():
    """Gradient-descent LR on the binary iris problem; print accuracy, plot."""
    iris = datasets.load_iris()
    keep = iris.target != 0
    X = normalize(iris.data[keep])
    y = iris.target[keep]
    # Relabel classes {1, 2} to {0, 1}.
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, seed=1)

    model = LogisticRegression(gradient_descent=True)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the result
    Plot().plot_in_2d(X_test, predicted, title="Logistic Regression",
                      accuracy=accuracy)
def fitting():
    # Fit regularised logistic regression on the microchip dataset using a
    # polynomial feature mapping (map_feature), report confusion-matrix
    # metrics, and plot the loss curve plus the decision boundary as the
    # zero-level contour of the linear score.
    data = pd.read_csv('microchips.txt', names=['test1', 'test2', 'result'])
    x1 = data['test1']
    x2 = data['test2']
    y = data['result']
    x = map_feature(x1.values, x2.values)
    alpha = 0.1
    max_iter = 1500
    model = LogisticRegression(alpha, max_iter, 0)
    loss, _ = model.fit(x, y, False)
    p = model.predict(x, False)
    # Threshold probabilities at 0.5 to get hard labels.
    p = [1 if i > 0.5 else 0 for i in p]
    tp = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 1])
    tn = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 0])
    fp = sum([1.0 for vp, vy in zip(p, y) if vp == 1 and vy == 0])
    fn = sum([1.0 for vp, vy in zip(p, y) if vp == 0 and vy == 1])
    print(tp, tn, fp, fn)
    print('Accuracy %.3f' % ((tp + tn) / (tp + tn + fp + fn)))
    print('Precision %.3f' % (tp / (tp + fp)))
    print('Recall %.3f' % (tp / (tp + fn)))
    plt.figure(figsize=(6, 8))
    plt.subplot(2, 1, 1)
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.subplot(2, 1, 2)
    positive = data[data['result'] == 1]
    negative = data[data['result'] == 0]
    plt.plot(positive['test1'], positive['test2'], 'k+')
    plt.plot(negative['test1'], negative['test2'], 'yo')
    # Evaluate the linear score w.x on a 50x50 grid of mapped features;
    # the decision boundary is where that score crosses zero.
    x1, x2 = np.mgrid[-1:1.5:50j, -1:1.5:50j]
    p = np.zeros((50, 50))
    for i in range(50):
        for j in range(50):
            x = map_feature(np.array([x1[i, j]]),
                            np.array([x2[i, j]])).squeeze()
            p[i, j] = x.dot(model.w)
    plt.contour(x1, x2, p, [0])
    plt.show()
# Get a baseline base_accuracy = np.mean(y_test == 0) # print('ROC AUC: {:.4f}'.format(roc_auc)) print('F1 Score: {:.4f}'.format(f1_value)) print('Accuracy: {:.2f}%'.format(100 * accuracy)) print('Baseline Accuracy: {:.2f}%'.format(100 * base_accuracy)) from sklearn.model_selection import train_test_split # Features and target X = data.copy() y = X.pop('target') # Split into training and testing X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) import sys sys.path.insert(0, '../') from logistic_regression import LogisticRegression lr_ = LogisticRegression(learning_rate=.1, gradient_descent=True) lr_.fit(X_train, y_train) evaluate(lr_, X_test, y_test) # from linear_model import LogisticRegression # lr_1 = LogisticRegression() # lr_1.fit(X_train, y_train) # evaluate(lr_1, X_test, y_test)
# Instantiate the remaining models for the benchmark run.
# NOTE(review): adaboost, naive_bayes, logistic_regression, the X/y splits
# and rescaled_y_train ({-1, 1} labels for margin-based models) are defined
# earlier in the file, outside this chunk. Python 2 print statements.
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tMultilayer Perceptron"
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
print "\tPerceptron"
perceptron.fit(X_train, y_train)
print "\tDecision Tree"
decision_tree.fit(X_train, y_train)
print "\tRandom Forest"
random_forest.fit(X_train, y_train)
print "\tSupport Vector Machine"
support_vector_machine.fit(X_train, rescaled_y_train)

# .........
#  PREDICT
# .........
y_pred = {}
from sklearn.cross_validation import train_test_split # Read the training data f = open("../data/train.csv") reader = csv.reader(f) next(reader, None) # skip header data = [data for data in reader] f.close() X = np.asarray([x[1:] for x in data], dtype=np.int16) y = np.asarray([x[0] for x in data], dtype=np.int16) X = np.true_divide(X, 255) # normalize image data to 0-1 del data # free up the memory print("loaded training data") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RandomState()) lr = LogisticRegression(C=0.35) lr.fit(X_train, y_train, 10) guesses = lr.predict(X_test) score = 0.0 for g in range(guesses.shape[0]): if guesses[g] == y_test[g]: score += 1 print("Score: ", score / len(guesses))
# 5-fold cross-validated grid search over learning_rate, tol and l2_strength.
cv = KFold(n_splits=5)
results = []
i = 1
# For each permutation of the parameters
for learning_rate, tol, l2_strength in itertools.product(learning_rates,
                                                         tols, l2_strengths):
    # NOTE(review): "27" assumes 3 candidate values per hyperparameter (3^3).
    print(f'{i} of 27 permutations...')
    # Re-created every iteration; the cv defined above the loop is unused.
    cv = KFold(n_splits=5)
    clf = LogisticRegression(learning_rate=learning_rate, tol=tol,
                             l2_strength=l2_strength, max_iter=500)
    scores = []  # to store score from each split
    # Compute cross validation score
    for train_index, validation_index in cv.split(X_train):
        # train
        clf.fit(X_train[train_index, :], y_train[train_index])
        # score
        scores.append(clf.score(X_train[validation_index, :],
                                y_train[validation_index]))
    # Store the results
    cv_score = np.mean(scores)
    params = {'learning_rate': learning_rate, 'tol': tol,
              'l2_strength': l2_strength, 'cv_score': cv_score}
    results.append(params)
    i += 1
# Print the parameters that gave the best CV score
best_score = 0
from util import read_file
from logistic_regression import LogisticRegression

# Sweep regularisation strength mu over 10^-8 .. 10^1 and plot validation
# accuracy for the SGD-trained model.
data, labels = read_file('../1571/train.txt')
data_train, data_valid, labels_train, labels_valid = \
    train_test_split(data, labels, test_size=0.3, random_state=0)
mus = list(10 ** x for x in range(-8, 2))
sgd_scores = []
for mu in mus:
    sgd_model = LogisticRegression(method="sgd", mu=mu, rate=0.1, decay=0.6,
                                   random_state=0)
    sgd_model.fit(data_train, labels_train)
    predicted = sgd_model.predict(data_valid)
    sgd_scores.append(accuracy_score(labels_valid, predicted))
pp.figure()
pp.xscale('log')
pp.scatter(mus, sgd_scores)
pp.xlabel('regularization strength')
pp.ylabel('accuracy')
pp.savefig('./sgd_regularization.png')
# Repeat the sweep for the L-BFGS solver. NOTE(review): this chunk ends
# mid-loop; the remainder of the loop body continues past this view.
lbfgs_scores = []
for mu in mus:
    sgd_model = LogisticRegression(method="lbfgs", mu=mu, rate=0.1,
                                   decay=0.6, random_state=0)
def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''
    # NOTE(review): despite the docstring, only range(1) trial is executed
    # here, and the returned stats has THREE rows ordered DT, RF, LR.
    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array(data[:, 0])
    n, d = X.shape
    all_accuracies_dt = []
    all_accuracies_lr = []
    all_accuracies_rf = []
    for trial in range(1):
        # Shuffle rows once per trial.
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        ind = np.arange(X.shape[0])
        # Classifiers are constructed once and re-fit on every fold.
        classifier_dt = DecisionTree(9)
        classifier_lr = LogisticRegression(max_steps=10000, epsilon=1e-6,
                                           step_size=1, l=3)
        classifier_rf = RandomForest(ratio_per_tree=0.5, num_trees=100,
                                     max_tree_depth=8)
        scores_dt = []
        scores_lr = []
        scores_rf = []
        for i in range(10):
            # Random 10% test fold without replacement; remaining rows train.
            test_ind = np.random.choice(ind, int(X.shape[0] / 10),
                                        replace=False)
            ind = np.setdiff1d(np.arange(X.shape[0]), test_ind)
            X_train, Y_train, X_test, Y_test = X[ind], y[ind], X[test_ind], y[
                test_ind]
            # train the decision tree
            classifier_dt.fit(X_train, Y_train)
            accuracy_dt = accuracy_score(
                Y_true=Y_test, Y_predict=classifier_dt.predict(X_test))
            scores_dt.append(accuracy_dt)
            # train the logistic regression (with a prepended bias column)
            classifier_lr.fit(
                np.hstack((np.ones(len(X_train)).reshape(len(X_train), 1),
                           X_train)), Y_train)
            accuracy_lr = accuracy_score(Y_true=Y_test,
                                         Y_predict=classifier_lr.predict(
                                             np.hstack(
                                                 (np.ones(len(X_test)).reshape(
                                                     len(X_test), 1),
                                                  X_test))))
            scores_lr.append(accuracy_lr)
            # train the random forest (predict returns a tuple; [0] = labels)
            classifier_rf.fit(X_train, Y_train)
            accuracy_rf = accuracy_score(
                Y_true=Y_test, Y_predict=classifier_rf.predict(X_test)[0])
            scores_rf.append(accuracy_rf)
        all_accuracies_dt.append(np.mean(scores_dt))
        all_accuracies_lr.append(np.mean(scores_lr))
        all_accuracies_rf.append(np.mean(scores_rf))
    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(all_accuracies_dt)
    stddevDecisionTreeAccuracy = np.std(all_accuracies_dt)
    meanLogisticRegressionAccuracy = np.mean(all_accuracies_lr)
    stddevLogisticRegressionAccuracy = np.std(all_accuracies_lr)
    meanRandomForestAccuracy = np.mean(all_accuracies_rf)
    stddevRandomForestAccuracy = np.std(all_accuracies_rf)
    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
# Visualise the 2-D dataset, fit LR, plot the training cost, and report
# train/test accuracy.
# BUG FIX: the test-accuracy print was missing the "* 100" scaling that the
# train-accuracy print has, so it reported a fraction-based value against a
# percent sign.
fig = plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y_true)
plt.title("Dataset")
plt.xlabel("First Feature")
plt.ylabel("Second Feature")
plt.show()

y_true = y_true[:, np.newaxis]
X_train, X_test, y_train, y_test = train_test_split(X, y_true)
print(f'Shape X_train: {X_train.shape}')
print(f'Shape y_train: {y_train.shape}')
print(f'Shape X_test: {X_test.shape}')
print(f'Shape y_test: {y_test.shape}')

lr = LogisticRegression()
theta, bias, costs = lr.fit(X_train, y_train, n_iter=500,
                            learning_rate=0.008)

fig = plt.figure(figsize=(8, 6))
plt.plot(np.arange(500), costs)
plt.title("Development of cost over training")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

y_p_train = lr.predict(X_train)
y_p_test = lr.predict(X_test)

print(f"train accuracy: {100 - np.mean(np.abs(y_p_train - y_train)) * 100}%")
print(f"test accuracy: {100 - np.mean(np.abs(y_p_test - y_test)) * 100}%")
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
from logistic_regression import LogisticRegression

# Breast-cancer binary classification demo using the project's
# LogisticRegression implementation.
bc = datasets.load_breast_cancer()
print(type(bc))
X, y = bc.data, bc.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)


def accuracy(y_true, y_pred):
    # Fraction of predictions exactly matching the ground truth.
    return np.sum(y_true == y_pred) / len(y_true)


regressor = LogisticRegression(lr=0.001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)
print(f"LR accuracy: {accuracy(y_test, predictions)}")
# Instantiate the remaining models for the benchmark run (this chunk is a
# duplicate of an earlier fragment in the corpus).
# NOTE(review): adaboost, naive_bayes, logistic_regression, the X/y splits
# and rescaled_y_train ({-1, 1} labels for margin-based models) are defined
# earlier in the file, outside this chunk. Python 2 print statements.
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tMultilayer Perceptron"
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
print "\tPerceptron"
perceptron.fit(X_train, y_train)
print "\tDecision Tree"
decision_tree.fit(X_train, y_train)
print "\tRandom Forest"
random_forest.fit(X_train, y_train)
print "\tSupport Vector Machine"
support_vector_machine.fit(X_train, rescaled_y_train)

# .........
#  PREDICT
# .........
y_pred = {}