def main():
    """Evaluate each classifier on the movie-review polarity files.

    Reads the negative and positive review files, then evaluates Naive
    Bayes, logistic regression at two iteration counts, and finally the
    TensorFlow logistic regression, all with a 0.2 validation split.
    """
    negative = read_reviews_in_file("./rt-polaritydata/rt-polarity.neg")
    positive = read_reviews_in_file("./rt-polaritydata/rt-polarity.pos")

    NaiveBayes(negative, positive, val_split=0.2).evaluate_naive_bayes()

    # Same learning rate, two different iteration budgets.
    for iterations in (1000, 3000):
        classifier = LogisticRegression(
            negative, positive, val_split=0.2, lr=0.85, num_inter=iterations
        )
        classifier.evaluate_logistic_regression()

    # Just for fun – tensorflow
    LogisticRegression_tf(negative, positive, val_split=0.2, lr=0.01, num_inter=200)
def main():
    """Compare scikit-learn's logistic regression with the local one on iris.

    Keeps only the samples whose target is not 0, relabels the remaining
    targets 1 -> 0 and 2 -> 1, and prints the test accuracy of each
    classifier on the same train/test split.

    The original rebound ``clf1`` to the local ``LogisticRegression`` right
    after creating the scikit-learn estimator, so the "sklearn" accuracy was
    really the local model's. Each label is now paired with its own model.
    """
    # Load dataset
    data = datasets.load_iris()
    keep = data.target != 0
    X = normalize(data.data[keep])
    y = data.target[keep]  # boolean indexing copies, so the relabelling is local
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    contenders = (
        ("sklearn lr Accuracy:", linear_model.LogisticRegression),
        ("Our lr Accuracy:", LogisticRegression),
    )
    for label, make_classifier in contenders:
        clf = make_classifier()
        clf.fit(X_train, y_train)
        y_pred = np.reshape(clf.predict(X_test), y_test.shape)
        accuracy = accuracy_score(y_test, y_pred)
        print(label, accuracy)
def p02cde(train_path, valid_path, test_path, pred_path):
    """Fit logistic regression on the "t" and "y" label columns and estimate alpha.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for evaluation.
        test_path: Path to CSV file containing dataset for testing.
        pred_path: Path to save predictions; ``WILDCARD`` in it is replaced
            by the part letter ("c", "d", "e").
    """
    pred_path_c = pred_path.replace(WILDCARD, "c")
    pred_path_d = pred_path.replace(WILDCARD, "d")
    pred_path_e = pred_path.replace(WILDCARD, "e")  # not used in this block

    # Part (c): train and evaluate on the "t" label column.
    x_train, y_train = utils.load_dataset(train_path, label_col="t", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    x_test, y_test = utils.load_dataset(valid_path, label_col="t", add_intercept=True)
    t_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_c))
    np.savetxt(pred_path_c, t_pred)

    # Part (d): train on the "y" label column.
    # Fixed: this used to load test_path, i.e. it trained on the test set.
    x_train, y_train = utils.load_dataset(train_path, label_col="y", add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    x_test, y_test = utils.load_dataset(test_path, label_col="t", add_intercept=True)
    y_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_d))
    np.savetxt(pred_path_d, y_pred)

    # Part (e): average the part (d) model's output over the validation rows
    # labelled y == 1.
    # Fixed: the old comprehension iterated over an int (`for i in len(...)`),
    # filtered with a whole-array comparison, and used the training rows
    # instead of the validation rows it had just loaded.
    # NOTE(review): assumes load_dataset returns array-likes — TODO confirm.
    x_val, y_val = utils.load_dataset(valid_path, label_col="y", add_intercept=True)
    x_in_V = np.asarray(x_val)[np.asarray(y_val) == 1]
    h = model.predict(x_in_V)
    alpha = np.mean(h)  # computed but not used or returned within this block
def one_vs_all(X, y, lam):
    """
    Multi-class classifier (one classifier per label against the rest).

    Parameters
    --------------------
    X: np.array(n,d)
        data
    y: np.array(n)
        labels (k distinct kinds)
    lam: int
        coefficient of the regularization term

    Returns:
    W: np.array(d,k)
        W = (w1, w2, ..., wk); each w_i holds the coefficients of the
        decision line separating the i-th label from all other labels
    """
    def decaying_rate(t):
        # Step size 1/(t+1), shared by every per-label fit.
        return 1/(t+1)

    classes = np.unique(y)
    X = np.insert(X, 3, 1, axis=1)  # constant-1 column inserted at index 3
    _, num_columns = X.shape
    W = np.empty((num_columns, len(classes)))

    for column, target_class in enumerate(classes):
        # +1 for the current label, -1 for every other label.
        signed = np.array([1 if value == target_class else -1 for value in y])
        classifier = LogisticRegression(X, signed, lam)
        W[:, column], _ = classifier.steepest_gradient_descent(
            learning_rate=decaying_rate, max_itr=1000
        )
    return W
def test_loss(self):
    """Loss is non-negative (within 1e-8) for random labels and their flip."""
    clf = LogisticRegression(2)
    clf.w = np.random.random(2) * 2 - 1
    inputs = np.random.random((3, 2)) * 2 - 1
    targets = np.random.randint(0, 2, 3)

    loss_as_labelled = clf.loss(inputs, targets)
    self.assertTrue(loss_as_labelled >= -1e-8)

    loss_flipped = clf.loss(inputs, 1 - targets)
    self.assertTrue(loss_flipped >= -1e-8)
def test1():
    """Fit on iris with every non-zero target merged into 1 and print predictions."""
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    # Collapse targets 1 and 2 into a single positive label.
    y = np.where(y != 0, 1, y)

    samples = [
        [5.3, 3.9, 1.2, 0.1],
        [1.3, 1.9, 0.2, 0.1],
        [11.3, 1.9, 0.2, 0.1],
    ]
    clf = LogisticRegression().fit(X, y)
    prediction = clf.predict(samples)
    print(prediction)
def train(train_X, train_y, val_X=None, val_y=None, factor=1, bias=0,
          num_epochs=1000, step_size=1e-3, check_grad=False, verbose=False):
    """Train a logistic regression model and print its accuracy.

    Args:
    - train_X (ndarray (shape: (N, D))): N D-dimensional training inputs.
    - train_y (ndarray (shape: (N, 1))): N scalar training labels.
    - val_X (ndarray (shape: (M, D))): M D-dimensional validation inputs.
    - val_y (ndarray (shape: (M, 1))): M scalar validation labels.

    Initialization Args:
    - factor (float): A constant factor to scale the initial weights.
    - bias (float): The bias value.

    Learning Args:
    - num_epochs (int): Number of gradient descent steps (1 <= num_epochs).
    - step_size (float): Gradient descent step size.
    - check_grad (bool): Whether to check the gradient using finite difference.
    - verbose (bool): Whether to print gradient information for every step.

    Validation accuracy is printed only when both val_X and val_y are given.
    """
    # Step 1: build the model from the input width and the number of distinct labels.
    num_features = np.shape(train_X)[1]
    num_classes = len(np.unique(train_y))
    model = LogisticRegression(num_features, num_classes)
    model.init_weights(factor, bias)

    # Step 2: fit.
    model.learn(train_X, train_y, num_epochs, step_size, check_grad, verbose)

    def percent_correct(inputs, targets):
        # Highest-scoring column is the predicted label.
        predicted = np.argmax(model.predict(inputs), axis=1)
        return 100 * np.mean(predicted == targets.flatten())

    # Step 3: report.
    print("Training Accuracy: {}%".format(percent_correct(train_X, train_y)))

    if val_X is None or val_y is None:
        return
    print("Validation Accuracy: {}%".format(percent_correct(val_X, val_y)))
def main():
    """Initialise CrypTen, restrict torch to one thread, and train the model.

    Reads ``init_w``, ``training_samples`` and ``alpha`` from the enclosing
    scope.
    """
    # Init Crypten and disable OpenMP threads (needed by @mpc.run_multiprocess)
    crypten.init()
    torch.set_num_threads(1)

    LogisticRegression().train(init_w, training_samples, alpha)
def test_fit_functional():
    """Fit on a 30x30 grid over [-5, 5]^2 and check three accuracy tiers.

    NOTE(review): the targets are the raw values 0.1*x + 0.2*y + 0.4, not
    thresholded class labels — confirm this is what ``accuracy`` expects.
    """
    import numpy as np
    import sklearn.model_selection
    from logistic_regression import LogisticRegression, accuracy

    num_samples = 30
    axis = np.linspace(-5, 5, num_samples)
    XX, YY = np.meshgrid(axis, axis)

    # Columns: x coordinate, y coordinate, constant 1 (bias trick).
    X = np.column_stack(
        [XX.flatten(), YY.flatten(), np.ones(XX.size)]
    ).astype(np.float32)
    y = (0.1 * XX + 0.2 * YY + 0.4).reshape(-1, 1)

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y)

    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(
        X_train, y_train, X_val, y_val,
        num_epochs=20, batch_size=4, alpha=0.1, _lambda=0.0,
    )
    predictions = model.predict(X_val)

    # Three separate tiers, checked in increasing order.
    for threshold in (0.65, 0.90, 0.99):
        assert accuracy(predictions, y_val) >= threshold
def iris_classification():
    """Train on 80% of the two-class iris subset and print held-out accuracy.

    Only samples whose target is not 2 are used. The original predicted on the
    entire shuffled data set (training rows included) while announcing "test
    predictions"; accuracy is now measured on the held-out 20% only.
    """
    print('\nIris classification using Logistic Regression\n')
    print('Initiating Data Load...')
    iris = datasets.load_iris()
    keep = iris.target != 2
    X, y = iris.data[keep], iris.target[keep]
    y = y.reshape(y.shape[0], 1)

    # Shuffle samples and labels with the same permutation.
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    print('Data load complete!')

    print('Constructing classifier...')
    layer_size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(layer_size)
    classifier.fit(X_train, y_train)

    print('Generating test predictions...')
    predictions = classifier.predict(X_test)
    # A row counts as correct only when every component matches.
    num_correct = np.sum(
        [all(y_true == y_pred) for y_true, y_pred in zip(y_test, predictions)]
    )
    accuracy = num_correct / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
def digit_recognition():
    """Train on 80% of the PCA-reduced digits data and print held-out accuracy.

    Features are reduced to 23 components and labels are one-hot encoded. The
    original predicted on the entire shuffled data set (training rows
    included) while announcing "test predictions"; accuracy is now measured on
    the held-out 20% only.
    """
    print('\nDigit recognition using Logistic Regression\n')
    print('Initiating Data Load...')
    digits = datasets.load_digits()
    X, y = digits.data, digits.target
    pca = PCA()
    X = pca.transform(X, num_components=23)
    y = one_hot_encode(y)

    # Shuffle samples and labels with the same permutation.
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X, y = np.asarray(X)[indices], np.asarray(y)[indices]

    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    print('Constructing classifier...')
    layer_size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(layer_size)
    classifier.fit(X_train, y_train)

    print('Generating test predictions...')
    predictions = classifier.predict(X_test)
    # A row counts as correct only when every one-hot component matches.
    num_correct = np.sum(
        [all(y_true == y_pred) for y_true, y_pred in zip(y_test, predictions)]
    )
    accuracy = num_correct / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
def main():
    """Fit the model, report on it, and write predictions for test/input.csv."""
    # Training matrices for the logistic regression model.
    features, targets = get_train_matrices()
    model = LogisticRegression(features, targets)

    # Arguments: learning rate, number of iterations, L2 regularization parameter.
    model.fit(0.01, 100000, 0)

    summary = "So, the weights and biases become:\nWeights:\n {}\nBiases:\n {}"
    print(summary.format(model.w, model.c))

    # Performance metrics, then the cost-vs-epochs curve.
    model.validate()
    model.graph_cost_vs_epochs()

    # Predict for the test inputs (first CSV column dropped) and save them.
    test_inputs = pd.read_csv('test/input.csv').values[:, 1:]
    test_outputs = model.predict(test_inputs)
    pd.DataFrame({'y': test_outputs.reshape(-1)}).to_csv('test/output.csv')
def test_fit_functional():
    """Fit on two Gaussian blobs and check three accuracy tiers.

    ``make_blobs`` is now called with keyword arguments: ``centers`` is
    keyword-only in current scikit-learn, so the previous positional call
    ``make_blobs(1000, 2, 2, ...)`` raises TypeError there. The keyword form
    is also accepted by older releases.
    """
    import numpy as np
    import sklearn.datasets
    import sklearn.model_selection
    from logistic_regression import LogisticRegression, accuracy

    features, targets = sklearn.datasets.make_blobs(
        n_samples=1000, n_features=2, centers=2,
        cluster_std=1, random_state=1234,
    )

    # Last column is a constant 1 (bias trick); the first two hold the features.
    X = np.zeros((1000, 3), dtype=np.float32)
    X[:, -1] = 1
    X[:, [0, 1]] = features
    y = targets[:, np.newaxis]

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y)

    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(
        X_train, y_train, X_val, y_val,
        num_epochs=20, batch_size=4, alpha=0.1, _lambda=0.0,
    )
    predictions = model.predict(X_val)

    # Three separate tiers, checked in increasing order.
    assert accuracy(predictions, y_val) >= 0.65
    assert accuracy(predictions, y_val) >= 0.90
    assert accuracy(predictions, y_val) >= 0.99
def logistic_regression_convert():
    """Instantiate the logistic-regression block and convert it to Verilog and VHDL.

    Output goes to the "converted_hdl" directory under the name
    "logistic_regression".
    """
    reset = myhdl.Signal(bool(0))
    clk = myhdl.Signal(bool(0))

    # --- Pipeline Pars
    pars = LogisticRegressionPars()
    pars.NB_PIPELINE_STAGES = 5
    pars.DATAWIDTH = 32
    pars.CHANNEL_WIDTH = 1
    pars.INIT_DATA = 0  # (0 for myhdl.intbv)
    pars.LEN_THETA = 3
    pars.CMD_FILE = 'tb/tests/mult_pipeline.list'

    io_bundle = LogisticRegressionIo()
    io_bundle(pars)

    instance = LogisticRegression().block_connect(
        pars, reset, clk,
        io_bundle.pipe_inpA, io_bundle.pipe_inpB, io_bundle.pipe_out_activ,
    )

    for target_hdl in ('Verilog', 'VHDL'):
        instance.convert(hdl=target_hdl, path="converted_hdl", name="logistic_regression")
def main():
    """
    Train on the GoT dataset, print test accuracy and plot the boundary.
    :return: None
    """
    dataset = load_got_dataset(path='data/got.csv', train_split=0.8)
    x_train, y_train, train_names, x_test, y_test, test_names, feature_names = dataset

    model = LogisticRegression()
    model.fit_gradient_descent(
        x_train, y_train, num_epochs=10000, learning_rate=0.01, verbose=True
    )

    predictions = model.predict(x_test)
    num_correct = float(np.sum(predictions == y_test))
    accuracy = num_correct / y_test.shape[0]
    print('Test accuracy: {}'.format(accuracy))

    # Test
    plot_boundary(x_test, test_names, model)
def __init__(self, random_stream, input, n_in, n_hidden, n_out=10):
    """Build a tanh hidden layer followed by a logistic-regression output layer.

    Args:
        random_stream: random number generator passed to the hidden layer as ``rng``.
        input: symbolic input fed to the hidden layer.
        n_in: number of input units.
        n_hidden: number of hidden units.
        n_out: number of output units (default 10). Previously this argument
            was ignored and the output layer was always built with 10 units.
    """
    self.hidden_layer = Hidden(
        rng=random_stream,
        input=input,
        n_in=n_in,
        n_out=n_hidden,
        activation=T.tanh,
    )
    self.LogisticRegressionLayer = LogisticRegression(
        input=self.hidden_layer.output,
        n_in=n_hidden,
        n_out=n_out,
    )

    # L1 norm (sum of absolute weights) and squared L2 norm over both layers.
    self.L1 = (
        abs(self.hidden_layer.W).sum()
        + abs(self.LogisticRegressionLayer.W).sum()
    )
    self.L2 = (
        (self.hidden_layer.W ** 2).sum()
        + (self.LogisticRegressionLayer.W ** 2).sum()
    )

    # Loss and error are those of the output layer.
    self.neg_loglikelihood = self.LogisticRegressionLayer.negative_log_likelihood
    self.error = self.LogisticRegressionLayer.error

    self.params = self.hidden_layer.params + self.LogisticRegressionLayer.params
    self.input = input
def test_logistic_regression():
    """The thresholded prediction for the last sample must match its label."""
    X = np.random.normal(size=(100, 2))
    # Label is 1 when the first feature exceeds 0.5, else 0.
    first_feature_high = X[:, 0] > 0.5
    y = np.where(first_feature_high, 1, 0).reshape(-1, 1)

    model = LogisticRegression()
    model.fit(X, y)

    last_score = model.predict(X)[-1]
    pred = 1 if last_score > 0.5 else 0
    assert pytest.approx(pred) == y[-1]
def main():
    """Fit logistic regression on two iris classes, print accuracy and plot.

    Samples whose target is 0 are dropped; the remaining targets are
    relabelled 1 -> 0 and 2 -> 1.
    """
    # Load dataset
    data = datasets.load_iris()
    keep = data.target != 0
    X = normalize(data.data[keep])
    y = data.target[keep]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = np.reshape(model.predict(X_test), y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(
        X_test,
        y_pred,
        title="Logistic Regression",
        accuracy=accuracy,
        legend_labels=data.target_names,
    )
def test_integ_fit():
    """After fitting, ``parameters`` must match the reference values (rel 1e-06)."""
    features = [np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])]
    labels = [np.array([1, 0, 1])]
    references = [np.array([0.01328192, 0.06222676, 0.1111716])]

    model = LogisticRegression()
    # The same model instance is refitted for each case.
    for case_x, case_y, want in zip(features, labels, references):
        model.fit(case_x, case_y)
        assert pytest.approx(want, 1e-06) == model.parameters
def test_l2_regularization_gradient():
    """The L2 gradient must equal the weights or twice the weights."""
    from logistic_regression import LogisticRegression

    model = LogisticRegression(input_dimensions=2)
    model.weights = np.float32([[1, 2, 4]]).T

    gradient = model._l2_regularization_gradient()

    desired = np.float32([[1, 2, 4]]).T
    matches_plain = np.allclose(gradient, desired, rtol=1e-3, atol=1e-3)
    # Short-circuit: the doubled form is only checked when the plain one fails.
    assert matches_plain or np.allclose(gradient, 2*desired, rtol=1e-3, atol=1e-3)
def test_predict():
    """With fixed weights, ``predict`` must return [[1], [0]] for the two rows."""
    from logistic_regression import LogisticRegression

    model = LogisticRegression(input_dimensions=2)
    model.weights = np.float32([[1, 2, 4]]).T

    inputs = np.float32([
        [1, 2, 1],
        [0, 0, -2],
    ])
    expected = np.float32([[1, 0]]).T

    np.testing.assert_allclose(model.predict(inputs), expected, rtol=1e-3, atol=1e-3)
def main():
    """Load the CSV named by ``sys.argv[1]`` and train on the retained columns.

    The file name is resolved relative to this script's directory. A
    ``results`` directory is created next to the script if it is missing.

    Changes from the original: the bare ``except:`` around ``os.stat`` /
    ``os.mkdir`` (which swallowed every exception) is replaced by
    ``os.makedirs(..., exist_ok=True)``, the duplicate ``dirname`` computation
    is removed, and the retained column indices are computed once instead of
    testing list membership for every cell.
    """
    dirname = os.path.dirname(__file__)
    output_dirname = os.path.join(dirname, 'results')
    os.makedirs(output_dirname, exist_ok=True)

    file_name = os.path.join(dirname, sys.argv[1])
    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    dropped_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    # header.index raises ValueError for a missing column, as before.
    to_remove = {header.index(name) for name in dropped_columns}
    kept = [j for j in range(len(header)) if j not in to_remove]

    # Built from every row (header included), then the header row is sliced off.
    X = np.array([[row[j] for j in kept] for row in d.data_set])
    X = convert_to_float(X[1:, ])

    y_col_nb = header.index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # NOTE(review): the results of transform() are not captured — presumably
    # both steps modify X in place; confirm against their implementations.
    m = MeanImputation(X)
    m.train()
    m.transform()

    sc = Scaling(X)
    sc.train()
    sc.transform()

    l = LogisticRegression(X=X, y=y)
    l.train()
def test_loss():
    """``_loss`` must reproduce the reference value (rel 1e-06)."""
    features = [np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])]
    labels = [np.array([1, 0, 1])]
    betas = [np.array([1.2, 3.4, 2.5])]
    references = [-22.599999]

    model = LogisticRegression()
    for case_x, case_y, case_beta, want in zip(features, labels, betas, references):
        got = model._loss(case_x, case_y, case_beta)
        assert pytest.approx(want, 1e-06) == got
def get(self, algorithm='logistic'):
    """Return a model built from ``self.data`` and ``self.labels``.

    The match is a case-insensitive substring test: a name containing
    'logistic' gives ``LogisticRegression``, otherwise one containing 'hinge'
    gives ``HingeLoss``. Any other name returns ``None``.
    """
    wanted = algorithm.lower()
    if 'logistic' in wanted:
        return LogisticRegression(self.data, self.labels)
    if 'hinge' in wanted:
        return HingeLoss(self.data, self.labels)
    return None
def get_predictions_logistic_regression(train_data, train_target, test_data, q_tag=None):
    """Train a logistic regression model and return its predictions on test_data.

    The model is created with the serial file name that
    ``get_serial_filename_lr`` returns for ``q_tag``.
    """
    from logistic_regression import LogisticRegression

    serial_path = get_serial_filename_lr(q_tag=q_tag)
    model = LogisticRegression(serial_filename=serial_path)
    model.train(train_data, train_target)
    predictions = model.get_predictions(test_data)
    return predictions
def __init__(self, rng, input, n_in, n_hidden, n_out):
    '''
    Build a tanh hidden layer feeding a logistic-regression output layer.

    @rng
        -type : numpy.random.RandomState
        -param : random number generator used to initialize weights
    @input
        -type : theano.tensor.TensorType
        -param : symbolic variable describing the input of the architecture
    @n_in
        -type : int
        -param : number of input units
    @n_hidden
        -type : int
        -param : number of hidden units
    @n_out
        -type : int
        -param : number of output units
    '''
    hidden = HiddenLayer(
        rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=T.tanh
    )
    self.hiddenLayer = hidden

    output_layer = LogisticRegression(
        input=hidden.output, n_in=n_hidden, n_out=n_out
    )
    self.logRegressionLayer = output_layer

    # Regularization options: L1 norm and squared L2 norm over both weight sets.
    self.L1 = abs(hidden.W).sum() + abs(output_layer.W).sum()
    self.L2_sqr = (hidden.W ** 2).sum() + (output_layer.W ** 2).sum()

    # Loss and error count come from the output layer.
    self.negative_log_likelihood = output_layer.negative_log_likelihood
    self.errors = output_layer.errors

    self.params = hidden.params + output_layer.params
    self.input = input
def test_cross_entropy_gradient():
    """The cross-entropy gradient must match the reference for fixed weights."""
    from logistic_regression import LogisticRegression

    model = LogisticRegression(input_dimensions=2)
    model.weights = np.float32([[1, 2, 4]]).T

    inputs = np.float32([
        [1, 2, 1],
        [0, 0, 1],
    ])
    targets = np.float32([[1, 0]]).T

    actual = model._binary_cross_entropy_gradient(inputs, targets)
    expected = np.float32([[-6e-5, -1e-4, 0.4909]]).T
    np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=1e-3)
def fitting():
    """Fit on student_score.txt, print metrics, and plot loss and boundary.

    Features are scaled as (x - mean) / (max - min). The original repeated
    those statistics as hard-coded numbers (65.644274, 69.769035, ...) when
    scaling the query point and un-scaling the decision boundary; they are now
    taken from the data, so the function stays correct if the file changes.
    The metric ratios print ``nan`` instead of raising ZeroDivisionError when
    a denominator is zero.
    """
    def ratio(numerator, denominator):
        # Avoid ZeroDivisionError, e.g. when nothing is predicted positive.
        return numerator / denominator if denominator else float('nan')

    data = pd.read_csv('student_score.txt', names=['Exam1', 'Exam2', 'admission'])
    raw = data[['Exam1', 'Exam2']]
    y = data['admission']

    mean = raw.mean()
    spread = raw.max() - raw.min()
    print(mean)
    print(spread)
    x = (raw - mean) / spread

    alpha = 10
    max_iter = 150
    model = LogisticRegression(alpha, max_iter)
    loss, _ = model.fit(x, y)

    # Query row: leading 1, then Exam1 = 45 and Exam2 = 85 on the training scale.
    query = np.array([[
        1,
        (45.0 - mean['Exam1']) / spread['Exam1'],
        (85.0 - mean['Exam2']) / spread['Exam2'],
    ]])
    p = model.predict(query, False)
    print('Predict %.3f when Exam1 euqals 45 and Exam2 equals 85' % p)

    plt.subplot(2, 1, 1)
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.title('Loss Curve')

    plt.subplot(2, 1, 2)
    negative = data[data['admission'] == 0]
    positive = data[data['admission'] == 1]
    plt.plot(negative['Exam1'], negative['Exam2'], 'yo')
    plt.plot(positive['Exam1'], positive['Exam2'], 'k+')
    print(model.w)

    # Boundary w0 + w1*x1 + w2*x2 = 0 in scaled space, mapped back to raw
    # Exam2 values.
    bx = data['Exam1']
    scaled_bx = (bx - mean['Exam1']) / spread['Exam1']
    by = (-spread['Exam2'] / model.w[2]) * (
        scaled_bx * model.w[1] + model.w[0]) + mean['Exam2']

    # Recomputed as in the original, in case fit() modified its input.
    x = (raw - mean) / spread
    p = [1 if i >= 0.5 else 0 for i in model.predict(x)]
    tp = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 1])
    tn = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 0])
    fp = sum([1.0 for vp, vy in zip(p, y) if vp == 1 and vy == 0])
    fn = sum([1.0 for vp, vy in zip(p, y) if vp == 0 and vy == 1])
    print(tp, tn, fp, fn)
    print('Accurancy %.2f' % ratio(tp + tn, tp + tn + fp + fn))
    print('Precision %.2f' % ratio(tp, tp + fp))
    print('Recall %.2f' % ratio(tp, tp + fn))

    plt.plot(bx, by)
    plt.show()
def train():
    """Fit a model on the CSV named by argv[1] and save it to the path in argv[2]."""
    input_path, model_path = sys.argv[1], sys.argv[2]

    features, targets = get_data(data=read_train_csv(input_path))

    model = LogisticRegression(iteration=30000)
    model.fit(features, targets)
    model.save(model_path)
def init_model(model_type, delta, area_width):
    """Create the model selected by ``model_type``.

    'LR' builds ``LogisticRegression(delta, area_width)``; 'DT' and 'RF' build
    ``DecisionTree(delta)`` and ``RandomForest(delta)``.

    Raises:
        SolverException: for any other ``model_type``.
    """
    if model_type == 'LR':
        return LogisticRegression(delta, area_width)
    if model_type == 'DT':
        return DecisionTree(delta)
    if model_type == 'RF':
        return RandomForest(delta)

    message = 'Invalid model type: ' + Fore.MAGENTA + model_type + Fore.RESET
    raise SolverException(message)