def init_bfgs(self, X, y):
    """
    Initializes the BFGS algorithm

    Parameters
    ----------
    X : N x D matrix composed of numerical features, where each feature vector is 1 x D
    y : N x 1 matrix, where y is a bit vector corresponding to whether each of the
        N observations is in class 1 or 0

    Returns
    -------
    W : initial weights
    B : 2 x D x D "matrix", where the first entry is the initial pseudo-Hessian
        matrix and the second is empty
    G : gradient matrix containing the initial gradient vector and an empty slot
        for the next gradient vector
    """
    W = np.zeros(shape=(2, X.shape[1]))  # initializing weights
    B = np.zeros(shape=(2, X.shape[1], X.shape[1]))
    B[0] = np.eye(X.shape[1])  # initializing pseudo-Hessian to the identity matrix
    pi = expit(safedot(X, W[0]))
    G = np.empty(shape=(2, X.shape[1]))
    G[0] = safedot(X.T, (pi - y))  # initial gradient
    return W, B, G

def l2_distance(self, X, x, gen_dist, kernel, **kwargs):
    """
    Computes the Euclidean distance between a single observation from the
    testing data and each observation from the training data

    Parameters
    ----------
    X : N x D matrix consisting of N observations of data
    x : D x 1 vector consisting of a single observation
    gen_dist : precomputed squared norm of each row vector in X
    kernel : function that applies a pseudo projection of the features into a
        space of different dimension

    Returns
    -------
    N x 1 vector of distances between x and each observation in X
    """
    if kernel is None:
        # ||x - X_i||^2 = ||X_i||^2 + ||x||^2 - 2 * X_i . x
        t_1 = gen_dist
        t_2 = safedot(x, x)
        t_3 = 2 * safedot(X, x)
        return t_1 + t_2 - t_3
    else:
        distances = np.zeros(X.shape[0])
        t_2 = kernel(x, x, **kwargs)  # constant across observations, so computed once
        for i in range(X.shape[0]):
            t_3 = kernel(X[i], x, **kwargs)
            distances[i] = gen_dist[i] + t_2 - (2 * t_3)
        return distances

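# A minimal standalone sketch (illustrative, not part of the original API; the
# helper name is hypothetical): verifies the expansion
# ||x - X_i||^2 = ||X_i||^2 + ||x||^2 - 2 * X_i . x that l2_distance relies on
# to reuse the precomputed row norms in gen_dist.
def _example_l2_distance_identity():
    import numpy as np
    rng = np.random.default_rng(0)
    X = rng.random((5, 3))             # 5 training observations, 3 features
    x = rng.random(3)                  # single test observation
    gen_dist = np.sum(X ** 2, axis=1)  # precomputed squared row norms
    fast = gen_dist + x @ x - 2 * (X @ x)
    slow = np.sum((X - x) ** 2, axis=1)
    assert np.allclose(fast, slow)
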
def gaussian(self, x, x_, **kwargs):
    """
    Computes the Gaussian (RBF) kernel of the two given vectors

    Parameters
    ----------
    x : D x 1 feature vector
    x_ : D x 1 feature vector
    sigma : parameter controlling the bandwidth of the kernel

    Returns
    -------
    kernelized inner product of the two given vectors
    """
    if kwargs:
        try:
            sigma = kwargs['sigma']
        except KeyError:
            raise ValueError('Must use proper parameters of the Gaussian (RBF) kernel')
    else:
        sigma = 30
    gamma = 1 / (2 * np.square(sigma))
    # ||x - x_||^2 expanded into inner products
    sq_norm = safedot(x, x) + safedot(x_, x_) - (2 * safedot(x, x_))
    n = gamma * sq_norm
    return np.exp(-n)

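# Hedged sketch (hypothetical helper name): checks that the inner-product
# expansion used by gaussian above matches the textbook RBF form
# exp(-||x - x_||^2 / (2 * sigma^2)).
def _example_gaussian_kernel():
    import numpy as np
    x = np.array([1.0, 2.0, 3.0])
    x_ = np.array([0.5, 1.5, 2.5])
    sigma = 30.0
    direct = np.exp(-np.sum((x - x_) ** 2) / (2 * sigma ** 2))
    gamma = 1 / (2 * np.square(sigma))
    expanded = np.exp(-gamma * (x @ x + x_ @ x_ - 2 * (x @ x_)))
    assert np.isclose(direct, expanded)
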
def conjugate_gradient(self, A, b, epsilon, x=None):
    """
    Solves the linear equation Ax = b using conjugate gradients. This
    algorithm can be found on pg. 111 of "Numerical Analysis"

    Parameters
    ----------
    A : in the context of Newton methods, A is the Hessian matrix
    b : in the context of Newton methods, b is the gradient vector
    epsilon : since convergence to exactly 0 would be a waste of computational
        resources, epsilon is used to determine whether the residual is
        "close enough" to 0
    x : can be given if there is a desired starting point

    Returns
    -------
    x : approximate solution to the linear equation of the form Ax = b
    """
    if x is None:
        x = np.ones(b.shape[0])  # default starting point
    r_0 = safedot(A, x) - b  # calculating the initial residual
    p = -r_0  # original search direction
    while la.norm(r_0) > epsilon:
        alpha = safedot(r_0, r_0) / safedot(safedot(p, A), p)  # 1-D minimizer along p
        x = x + (alpha * p)  # updating x
        r_1 = r_0 + (alpha * safedot(A, p))  # updating the residual
        beta = safedot(r_1, r_1) / safedot(r_0, r_0)  # scalar making the new direction A-conjugate to the previous ones
        p = -r_1 + (beta * p)  # updating the direction
        r_0 = r_1
    return x

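# Illustrative standalone sketch (hypothetical helper name): the same CG
# iteration as above, run on a small symmetric positive-definite system and
# checked against numpy's direct solver.
def _example_conjugate_gradient():
    import numpy as np
    rng = np.random.default_rng(0)
    M = rng.standard_normal((4, 4))
    A = M.T @ M + 4 * np.eye(4)   # symmetric positive-definite by construction
    b = rng.standard_normal(4)
    x = np.ones(4)
    r = A @ x - b
    p = -r
    while np.linalg.norm(r) > 1e-10:
        alpha = (r @ r) / (p @ A @ p)
        x = x + alpha * p
        r_new = r + alpha * (A @ p)
        beta = (r_new @ r_new) / (r @ r)
        p = -r_new + beta * p
        r = r_new
    assert np.allclose(x, np.linalg.solve(A, b))
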
def get_bernoulli_likelihood(self, X):
    """
    Calculates the likelihood of each observation in X being in a class,
    given that the features in X are either 0 or 1

    Parameters
    ----------
    X : N x D matrix of 1 x D feature vectors
    """
    log_p = np.log(self.p_matrix)
    log_p_not = np.log(1 - self.p_matrix)
    a = safedot(1 - X, log_p_not.T)  # contribution of features equal to 0
    b = safedot(X, log_p.T)          # contribution of features equal to 1
    self.predictions = np.argmax(np.log(self.priors) + a + b, axis=1)

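# Hedged sketch of the decision rule above on toy data (all values and the
# helper name are hypothetical): class scores are
# log(prior) + X @ log(p).T + (1 - X) @ log(1 - p).T, argmax over classes.
def _example_bernoulli_likelihood():
    import numpy as np
    p_matrix = np.array([[0.9, 0.1],   # per-class Bernoulli parameters,
                         [0.2, 0.8]])  # 2 classes x 2 features
    priors = np.array([0.5, 0.5])
    X = np.array([[1, 0], [0, 1]], dtype=float)
    scores = (np.log(priors) + X @ np.log(p_matrix).T
              + (1 - X) @ np.log(1 - p_matrix).T)
    assert np.array_equal(np.argmax(scores, axis=1), np.array([0, 1]))
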
def fit(self, X, y, max_iters=10000, save_weights=False, epsilon=1e-5):
    """
    Finds optimal weights by adjusting them only when incorrect predictions
    are made

    Parameters
    ----------
    X : N x D matrix composed of numerical features, where each feature vector is 1 x D
    y : N x 1 matrix, where y is a bit vector corresponding to whether each of the
        N observations is in class 1 or 0
    max_iters : maximum number of iterations before the algorithm will terminate
    save_weights : whether or not the weights obtained from each iteration should be saved
    epsilon : small number used to test for approximate convergence of the weights
    """
    if not np.allclose(np.unique(y), np.array([-1, 1])):
        y[y == 0] = -1  # perceptron updates require labels in {-1, 1}
    w = -np.ones(shape=X.shape[1]) + np.finfo('float').resolution
    W = np.zeros(shape=(max_iters, w.shape[0]))
    for i in range(max_iters):
        idx = i % X.shape[0]  # cycle through every observation
        x = X[idx]
        y_hat = np.sign(safedot(x, w))
        # Only updates when an incorrect prediction is made
        if not np.allclose(y_hat, y[idx]):
            w += y[idx] * x
        W[i] = w
        if i > 50 and la.norm(W[i] - W[i - 50]) < epsilon:
            break
    self.w = w
    self.W = W

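# Illustrative standalone sketch of the update rule above (toy data and
# hypothetical helper name): w += y_i * x_i whenever sign(x_i . w) != y_i,
# with labels already converted to {-1, 1}.
def _example_perceptron_update():
    import numpy as np
    X = np.array([[2.0, 1.0], [1.0, 2.0], [-1.0, -2.0], [-2.0, -1.0]])
    y = np.array([1, 1, -1, -1])   # linearly separable labels in {-1, 1}
    w = np.zeros(2)
    for _ in range(10):            # a few passes over the data
        for x_i, y_i in zip(X, y):
            if np.sign(x_i @ w) != y_i:
                w += y_i * x_i     # move the boundary toward the mistake
    assert np.all(np.sign(X @ w) == y)
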
def polynomial(self, x, x_, **kwargs):
    """
    Computes the polynomial kernel of the two given vectors

    Parameters
    ----------
    x : D x 1 feature vector
    x_ : D x 1 feature vector
    c : c >= 0 is a free parameter that controls the influence of higher-order
        terms versus lower-order terms in the polynomial
    d : degree of the polynomial

    Returns
    -------
    kernelized inner product of the two given vectors
    """
    if kwargs:
        try:
            c = kwargs['c']
            d = kwargs['d']
        except KeyError:
            raise ValueError('Must use proper arguments for the polynomial kernel')
    else:
        c = 1
        d = 2
    n = safedot(x, x_) + c
    return np.power(n, d)

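# Illustrative sketch (hypothetical helper name): for d = 2 and c = 0, the
# polynomial kernel (x . x_)^2 equals an ordinary inner product in the space
# of pairwise feature products, which is the "pseudo projection" the kernels
# here provide.
def _example_polynomial_kernel():
    import numpy as np
    x = np.array([1.0, 2.0])
    x_ = np.array([3.0, 4.0])
    kernel_value = (x @ x_) ** 2

    # explicit degree-2 feature map phi(v) = (v1^2, v2^2, sqrt(2) v1 v2)
    def phi(v):
        return np.array([v[0] ** 2, v[1] ** 2, np.sqrt(2) * v[0] * v[1]])

    assert np.isclose(kernel_value, phi(x) @ phi(x_))
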
def irls(self, X, y, iterations=50, l2_reg=0, save_weights=False):
    """
    Solves the likelihood function using the Iteratively Reweighted Least
    Squares (IRLS) method

    Parameters
    ----------
    X : N x D matrix composed of numerical features, where each feature vector is 1 x D
    y : N x 1 matrix, where y is a bit vector corresponding to whether each of the
        N observations is in class 1 or 0
    iterations : maximum number of iterations before the algorithm will terminate
    l2_reg : amount of l2 regularization to be applied
    save_weights : whether or not the weights obtained from each iteration should be saved
    """
    w = np.zeros(X.shape[1])
    prediction = np.ones(X.shape[1])
    if save_weights:
        W = np.empty(shape=(iterations, w.shape[0]))
    i = 0
    while i < iterations:
        n = safedot(X, w)
        prediction = expit(n)  # predicting new y values based on current weights
        s = prediction * (1 - prediction)
        s[s == 0] = np.finfo('float').resolution  # ensuring that matrix S will be invertible
        S = np.diag(s)
        z = n + self.score_irls(y, prediction, S)  # working response variable
        if np.allclose(z, n):  # converged: the working response no longer changes
            if save_weights:
                W = W[:i]
            break
        w_0 = la.inv(la.multi_dot((X.T, S, X)))
        w_1 = la.multi_dot((X.T, S, z))
        w_n = safedot(w_0, w_1)  # weighted least-squares solution
        w = w_n + (l2_reg * w)
        if save_weights:
            W[i] = w
        i += 1
    self.w = w
    if save_weights:
        self.W = W

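# Hedged sketch (standalone, toy data, hypothetical names): one IRLS update
# written two equivalent ways, showing that solving the weighted least-squares
# problem above with working response z = Xw + S^-1 (y - pi) is the same as a
# Newton step w + (X^T S X)^-1 X^T (y - pi).
def _example_irls_step():
    import numpy as np
    from scipy.special import expit
    rng = np.random.default_rng(1)
    X = rng.standard_normal((20, 3))
    y = (rng.random(20) > 0.5).astype(float)
    w = np.zeros(3)
    pi = expit(X @ w)
    s = pi * (1 - pi)
    S = np.diag(s)
    z = X @ w + (y - pi) / s  # working response
    w_wls = np.linalg.solve(X.T @ S @ X, X.T @ S @ z)
    w_newton = w + np.linalg.solve(X.T @ S @ X, X.T @ (y - pi))
    assert np.allclose(w_wls, w_newton)
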
def update_B(self, B, W, G):
    """
    Performs the update of the pseudo-Hessian matrix

    Parameters
    ----------
    B : 2 x D x D "matrix" containing the current pseudo-Hessian matrix and an
        empty slot for the next one
    W : 2 x D matrix containing the two weights calculated from the past two
        iterations of the BFGS algorithm
    G : 2 x D matrix containing the two gradients calculated from the past two
        iterations of the BFGS algorithm

    Returns
    -------
    B[1] : D x D updated pseudo-Hessian matrix
    """
    dW = W[1] - W[0]
    dG = G[1] - G[0]
    t_1b = safedot(dG, dW)
    if not t_1b:
        t_1 = 0
    else:
        t_1 = np.outer(dG, dG) / t_1b  # rank-one correction enforcing the secant condition
    Bs = safedot(B[0], dW)
    t_2a = np.outer(Bs, Bs)
    t_2b = safedot(dW, Bs)
    B[1] = B[0] + t_1 - (t_2a / t_2b)  # standard BFGS update of the pseudo-Hessian
    return B[1]

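# Illustrative check of the update above (hypothetical helper name): after
# B_new = B + (dG dG^T)/(dG . dW) - (B dW)(B dW)^T / (dW . B dW), the secant
# condition B_new @ dW == dG must hold, which is what makes B a usable
# pseudo-Hessian.
def _example_bfgs_secant_condition():
    import numpy as np
    rng = np.random.default_rng(2)
    B = np.eye(3)
    dW = rng.standard_normal(3)
    dG = rng.standard_normal(3)
    Bs = B @ dW
    B_new = B + np.outer(dG, dG) / (dG @ dW) - np.outer(Bs, Bs) / (dW @ Bs)
    assert np.allclose(B_new @ dW, dG)
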
def classify(self, X):
    """
    Uses the weights calculated during fitting to classify new features

    Parameters
    ----------
    X : M x D matrix composed of numerical features, where each feature vector is 1 x D
    """
    preds = np.sign(safedot(X, self.w))
    preds[preds < 1] = 0  # mapping {-1, 0} predictions back to class 0
    self.predictions = preds

def bfgs(self, X, y, iterations=20, save_weights=False, t=0, k=.12):
    """
    Implementation of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm

    Parameters
    ----------
    X : N x D matrix composed of numerical features, where each feature vector is 1 x D
    y : N x 1 matrix, where y is a bit vector corresponding to whether each of the
        N observations is in class 1 or 0
    iterations : maximum number of iterations before the algorithm will terminate
    save_weights : whether or not the weights obtained from each iteration should be saved
    k : must be in the range (0, 1); larger values will decrease step sizes more
    t : larger values will decrease the step size taken

    Note
    ----
    Significantly faster convergence with normalized X
    """
    W, B, G = self.init_bfgs(X, y)
    weights = np.empty(shape=(iterations, W.shape[1]))
    for i in range(iterations):
        n = safedot(X, W[0])
        G[1] = (safedot(X.T, expit(n) - y)) / G.shape[1]  # scaled gradient
        d = safedot(la.pinv(B[0]), G[1])  # search direction from the pseudo-Hessian
        a = self.step_size(iter_num=(i + 1) * 10, t=t, k=k)
        W[1] = W[0] - (a * d)
        B[0] = self.update_B(B, W, G)
        G[0] = G[1]
        W[0] = W[1]
        if save_weights:
            weights[i] = W[0]
    self.w = W[0]
    if save_weights:
        self.W = weights

def binary_newton_cg(self, X, y, max_iters=10, l2_reg=0, save_weights=False, epsilon=1e-4):
    """
    Solves the optimization of the likelihood function using Newton's method
    with conjugate gradients

    Parameters
    ----------
    X : N x D matrix composed of numerical features, where each feature vector is 1 x D
    y : N x 1 matrix, where y is a bit vector corresponding to whether each of the
        N observations is in class 1 or 0
    max_iters : maximum number of iterations before the algorithm will terminate
    l2_reg : determines the amount of regularization that will be applied at each iteration
    save_weights : whether or not the weights obtained from each iteration should be saved
    epsilon : small number used to test for approximate convergence when determining
        the direction to be descended
    """
    w = np.zeros(X.shape[1])
    if save_weights:
        W = np.zeros(shape=(max_iters, X.shape[1]))
    for i in range(max_iters):
        mu = expit(safedot(X, w))  # calculating predicted probabilities
        g = safedot(X.T, (mu - y))  # gradient vector
        H = safedot(safedot(X.T, np.diag(mu * (1 - mu))), X)  # Hessian of the negative log-likelihood
        n = self.conjugate_gradient(H, g, epsilon)  # Newton direction solved via CG
        w = (w - n) + (l2_reg * w)  # updating weights with l2 regularization
        if save_weights:
            W[i] = w
    if save_weights:
        self.W = W  # weights of each iteration
    self.w = w  # final weights

def create_variables(self, X):
    """
    Defines variables to make the softmax function more convenient in the future

    Parameters
    ----------
    X : N x D matrix of 1 x D feature vectors
    """
    try:
        inv_sigma = la.inv(self.pooled_sigma)
    except la.LinAlgError:
        inv_sigma = la.pinv(self.pooled_sigma)  # fall back to the pseudo-inverse when singular
    self.gamma = np.zeros(shape=self.priors.shape[0])
    self.beta = np.zeros(shape=(self.gamma.shape[0], X.shape[1]))
    for i in range(self.priors.shape[0]):
        a = safedot(self.means[i], inv_sigma)
        b = safedot(a, self.means[i]) / (-2)
        self.gamma[i] = b + np.log(self.priors[i])
        self.beta[i] = safedot(inv_sigma, self.means[i])

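# Hedged sketch (toy values, hypothetical names) of why gamma and beta are
# precomputed above: the linear discriminant
# delta_k(x) = x . (Sigma^-1 mu_k) - 0.5 * mu_k . Sigma^-1 mu_k + log(prior_k)
# reduces to a single x . beta_k + gamma_k at classification time.
def _example_lda_discriminant():
    import numpy as np
    sigma = np.array([[2.0, 0.3], [0.3, 1.0]])  # pooled covariance
    mu = np.array([1.0, -1.0])                  # class mean
    prior = 0.5
    inv_sigma = np.linalg.inv(sigma)
    beta = inv_sigma @ mu
    gamma = -0.5 * (mu @ inv_sigma @ mu) + np.log(prior)
    x = np.array([0.5, 0.5])
    direct = x @ inv_sigma @ mu - 0.5 * (mu @ inv_sigma @ mu) + np.log(prior)
    assert np.isclose(x @ beta + gamma, direct)
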
def get_multinomial_likelihood(self, X):
    """
    Calculates the likelihood of each observation in X being in a class,
    given that the features in X follow a multinomial distribution

    Parameters
    ----------
    X : N x D matrix of 1 x D feature vectors
    """
    ll = safedot(X, self.l_p_matrix.T)  # log-likelihood of each observation under each class
    l_prior = np.log(self.priors)
    self.predictions = np.argmax(ll + l_prior, axis=1)

def classify(self, X):
    """
    Performs classification of observations depending on the selected model

    Parameters
    ----------
    X : N x D matrix of 1 x D feature vectors
    """
    if self.model == 'Linear':
        n = safedot(X, self.beta.T) + self.gamma
        self.predictions = np.argmax(self.softmax(n), axis=1)
    elif self.model == 'Quadratic' or self.model == 'Regularized':
        t_1 = (-0.5) * self.determinants
        t_2 = (-0.5) * self.compute_mahalanobis(X)
        self.predictions = np.argmax(t_1 + t_2, axis=1)

def score_irls(self, y, pi, S):
    """
    Gives the working response after each update of the IRLS algorithm

    Parameters
    ----------
    y : actual label for each feature vector
    pi : prediction in [0, 1], where closer proximity to 0 or 1 means the
        feature vector is more likely to be a 0 or 1 respectively
    S : diagonal matrix calculated from the decomposition of the Hessian

    Returns
    -------
    error : response of predictions compared to actual values
    """
    error = safedot(la.pinv(S), (y - pi))  # S^-1 (y - pi)
    return error

def convert_labels_binary(self, X, w, threshold=0.5):
    """
    Converts values to 0 or 1 based on thresholding of the expit function

    Parameters
    ----------
    X : N x D matrix composed of numerical features
    w : weights determined by the chosen solver
    threshold : some value in the range (0, 1) that sets the decision boundary
        between the two classes

    Returns
    -------
    y : bit vector containing 0 or 1 depending on which class each observation belongs to
    """
    n_pred = expit(safedot(X, w))
    y = self.threshold(n_pred, threshold)
    return y

def sigmoid(self, x, x_, **kwargs):
    """
    Computes the sigmoid kernel of the two given vectors

    Parameters
    ----------
    x : D x 1 feature vector
    x_ : D x 1 feature vector
    alpha : scaling parameter
    c : shifting parameter
    """
    if kwargs:
        try:
            alpha = kwargs['alpha']
            c = kwargs['c']
        except KeyError:
            raise ValueError('Must use proper arguments for the sigmoid kernel function.')
    else:
        alpha = 1e-10
        c = 0.5
    n = alpha * safedot(x, x_) + c
    return np.tanh(n)