def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, rho=None, q=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : ndarray, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    rho : ndarray, shape (2,)
        Label-noise rates; rho[0] is the flip probability for negative
        labels and rho[1] for positive labels.
    q : ndarray, shape (n_samples,) optional
        Per-sample indicator: 0 for noisy labels, 1 for clean labels.
        If not provided, all labels are treated as noisy.

    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # 0: noise, 1: clean
    if q is None:
        q = np.zeros_like(y)

    y01 = np.array(y == 1, dtype=int)

    w, c, yz = _intercept_dot(w, X, y)

    loss_yzp = -log_logistic(+yz)
    loss_yzn = -log_logistic(-yz)

    wp = 1 - np.take(rho, 1 - y01)
    wn = np.take(rho, y01)

    noise_loss = np.sum(sample_weight * (1 - q) *
                        (wp * loss_yzp - wn * loss_yzn)) / (1 - rho[0] - rho[1])
    clean_loss = np.sum(sample_weight * q * loss_yzp)
    out = clean_loss + noise_loss + .5 * alpha * np.dot(w, w)

    z = expit(yz)
    # The noise-corrected part of the gradient carries the same
    # 1 / (1 - rho[0] - rho[1]) normalization as the noise loss above.
    z0 = sample_weight * (q * (z - 1) * y +
                          (1 - q) * (wp * (z - 1) * y + wn * z * y) /
                          (1 - rho[0] - rho[1]))
    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
def log_loss(wp, X, target, C, PN, NP):
    """
    It is minimized using the "L-BFGS-B" method of the "scipy.optimize.minimize"
    function, and results in similar coefficients to sklearn's Logistic
    Regression when PN=NP=0.0.

    Parameters
    -------------
    wp: Coefficients & Intercept
    X: (N,M) shaped data matrix
    target: (N,) shaped 1-D array of targets
    C: Regularization
    PN: % of Positive samples labeled as Negative
    NP: % of Negative samples labeled as Positive

    Returns
    ------------
    loss_value: float
    """
    c = wp[-1]
    w = wp[:-1]
    z = np.dot(X, w) + c
    yz = target * z                    # to compute l(t,y)
    nyz = -target * z                  # to compute l(t,-y)
    ls = -log_logistic(yz)             # l(t,y)
    nls = -log_logistic(nyz)           # l(t,-y)
    idx = target == 1                  # indexes of samples w/ P label
    loss = ls.copy()                   # To store l-hat
    loss[idx] = (1 - NP) * ls[idx] - PN * nls[idx]     # Modified loss for P samples
    loss[~idx] = (1 - PN) * ls[~idx] - NP * nls[~idx]  # Modified loss for N samples
    # Normalization & regularization; the penalty is added once, not per sample.
    loss_value = loss.sum() / (1 - PN - NP) + .5 * (1. / C) * np.dot(w, w)
    return loss_value                  # Final loss
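# A minimal usage sketch for the noisy-label loss above (assumptions: `log_loss`
# is in scope, e.g. defined in the same script; labels are in {-1, +1}; and
# `log_logistic` resolves to a numerically stable log-sigmoid such as the
# stand-in below). With PN=NP=0 the fitted coefficients should roughly match
# sklearn's LogisticRegression(C=C), as the docstring claims.
import numpy as np
from scipy.optimize import minimize
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

def log_logistic(x):
    # log(sigmoid(x)) computed without overflow
    return -np.logaddexp(0, -x)

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
target = 2 * y - 1                      # map {0, 1} -> {-1, +1}
C = 1.0
w0 = np.zeros(X.shape[1] + 1)           # coefficients + intercept

res = minimize(log_loss, w0, args=(X, target, C, 0.0, 0.0), method="L-BFGS-B")
clf = LogisticRegression(C=C).fit(X, y)
print(res.x[:-1])                       # should be close to clf.coef_.ravel()
print(clf.coef_.ravel())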
def test_logistic_sigmoid():
    # Check correctness and robustness of logistic sigmoid implementation
    def naive_log_logistic(x):
        return np.log(1 / (1 + np.exp(-x)))

    x = np.linspace(-2, 2, 50)
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    extreme_x = np.array([-100., 100.])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
def test_logistic_sigmoid():
    """Check correctness and robustness of logistic sigmoid implementation"""
    naive_logistic = lambda x: 1 / (1 + np.exp(-x))
    naive_log_logistic = lambda x: np.log(naive_logistic(x))

    x = np.linspace(-2, 2, 50)
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    extreme_x = np.array([-100., 100.])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
def test_logistic_sigmoid():
    # Check correctness and robustness of logistic sigmoid implementation
    naive_logistic = lambda x: 1 / (1 + np.exp(-x))
    naive_log_logistic = lambda x: np.log(naive_logistic(x))

    x = np.linspace(-2, 2, 50)
    with warnings.catch_warnings(record=True):
        assert_array_almost_equal(logistic_sigmoid(x), naive_logistic(x))
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    extreme_x = np.array([-100., 100.])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
def _logistic_loss_and_grad(w, X, y, alpha, penalty, fit_intercept, sample_weight):
    n_samples, n_features = X.shape
    grad = np.empty_like(w)
    c = 0.
    if fit_intercept:
        c = w[-1]
        w = w[:-1]
    z = safe_sparse_dot(X, w) + c
    yz = y * z
    if penalty == "l2":
        reg = .5 * alpha * np.dot(w, w)
        reg_grad = alpha * w
    else:
        reg = 0
        reg_grad = 0
    out = -np.sum(sample_weight * log_logistic(yz)) + reg

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y
    grad[:n_features] = safe_sparse_dot(X.T, z0) + reg_grad
    if fit_intercept:
        grad[-1] = z0.sum()
    return out, grad
def score_samples(self, X):
    """Compute the pseudo-likelihood of X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Values of the visible layer. Must be all-boolean (not checked).

    Returns
    -------
    pseudo_likelihood : float
        Mean value of the pseudo-likelihood (proxy for likelihood) over
        the samples.

    Notes
    -----
    This method is not deterministic: it computes a quantity called the
    free energy on X, then on a randomly corrupted version of X, and
    returns the log of the logistic function of the difference.
    """
    # Randomly corrupt one feature in each sample in v.
    ind = (np.arange(X.shape[0]),
           np.random.randint(0, X.shape[1], X.shape[0]))
    X_ = X.copy()
    X_[ind] = 1 - X_[ind]

    fe = self._free_energy(X)
    fe_ = self._free_energy(X_)
    return (X.shape[1] * log_logistic(fe_ - fe)).mean()
def _logistic_loss(w, X, y, alpha, sample_weight=None):
    """Computes the logistic loss.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.
    """
    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)
    return out
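# Why `log_logistic` rather than np.log(expit(...)): the loss above is
# -sum(w_i * log(sigmoid(y_i z_i))), and log(sigmoid(x)) underflows to log(0)
# for large negative x when computed naively. A stable equivalent is
# -np.logaddexp(0, -x). A small self-contained check (stand-in function, not
# the sklearn helper itself):
import numpy as np
from scipy.special import expit

def stable_log_logistic(x):
    # log(sigmoid(x)) = -log(1 + exp(-x)), computed without overflow
    return -np.logaddexp(0, -x)

x = np.array([-1000., -10., 0., 10., 1000.])
print(stable_log_logistic(x))        # finite everywhere, e.g. -1000. at the left end
with np.errstate(divide="ignore"):
    print(np.log(expit(x)))          # -inf at x = -1000 with the naive formula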
def score_samples(self, X):
    """Compute the pseudo-likelihood of X.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Values of the visible layer. Must be all-boolean (not checked).

    Returns
    -------
    pseudo_likelihood : array-like, shape (n_temperatures, n_samples,)
        Value of the pseudo-likelihood (proxy for likelihood).

    Notes
    -----
    This method is not deterministic: it computes a quantity called the
    free energy on X, then on a randomly corrupted version of X, and
    returns the log of the logistic function of the difference.
    """
    check_is_fitted(self, "components_")

    v = check_array(X, accept_sparse='csr')
    rng = check_random_state(self.random_state)

    # Randomly corrupt one feature in each sample in v.
    ind = (np.arange(v.shape[0]),
           rng.randint(0, v.shape[1], v.shape[0]))
    if issparse(v):
        data = -2 * v[ind] + 1
        v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)
    else:
        v_ = v.copy()
        v_[ind] = 1 - v_[ind]

    fe = self._free_energy(v, 0)
    fe_ = self._free_energy(v_, 0)
    return v.shape[1] * log_logistic(fe_ - fe)
def _logistic_loss(w, X, y, alpha, sample_weight=None, rho=None, q=None):
    """Computes the logistic loss.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : ndarray, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    rho : ndarray, shape (2,)
        Label-noise rates; rho[0] is the flip probability for negative
        labels and rho[1] for positive labels.
    q : ndarray, shape (n_samples,) optional
        Per-sample indicator: 0 for noisy labels, 1 for clean labels.
        If not provided, all labels are treated as noisy.

    Returns
    -------
    out : float
        Logistic loss.
    """
    # 0: noise, 1: clean
    if q is None:
        q = np.zeros_like(y)
    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    y01 = np.array(y == 1, dtype=int)

    w, c, yz = _intercept_dot(w, X, y)

    loss_yzp = -log_logistic(+yz)
    loss_yzn = -log_logistic(-yz)

    wp = (1 - np.take(rho, 1 - y01)) / (1 - rho[0] - rho[1])
    wn = (-np.take(rho, y01)) / (1 - rho[0] - rho[1])

    noise_loss = np.sum(sample_weight * (1 - q) * (wp * loss_yzp + wn * loss_yzn))
    clean_loss = np.sum(sample_weight * q * loss_yzp)
    out = clean_loss + noise_loss + .5 * alpha * np.dot(w, w)
    return out
def _logistic_cost_grad(X, Y, w, diagA):
    ''' Calculates cost and gradient for logistic regression '''
    n = X.shape[0]
    Xw = np.dot(X, w)
    s = expit(Xw)
    wdA = w * diagA
    wdA[0] = 1e-3  # broad prior for bias term => almost no regularization
    cost = np.sum(Xw * (1 - Y) - log_logistic(Xw)) + np.sum(w * wdA) / 2
    grad = np.dot(X.T, s - Y) + wdA
    return [cost / n, grad / n]
def logistic_loss(w, X, Y, alpha):
    """
    Implementation of the logistic loss function when Y is a probability
    distribution.

        loss = -SUM_i SUM_k y_ik * log(P[yi == k]) + alpha * ||w||^2
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    intercept = 0

    if n_classes > 2:
        fit_intercept = w.size == (n_classes * (n_features + 1))
        w = w.reshape(n_classes, -1)
        if fit_intercept:
            intercept = w[:, -1]
            w = w[:, :-1]
    else:
        fit_intercept = w.size == (n_features + 1)
        if fit_intercept:
            intercept = w[-1]
            w = w[:-1]

    z = safe_sparse_dot(X, w.T) + intercept

    if n_classes == 2:
        # in the binary case, simply compute the logistic function
        p = np.vstack([log_logistic(-z), log_logistic(z)]).T
    else:
        # compute the logistic function for each class and normalize
        denom = expit(z)
        denom = denom.sum(axis=1).reshape((denom.shape[0], -1))
        p = log_logistic(z)
        loss = -(Y * p).sum()
        loss += np.log(denom).sum()  # Y.sum() = 1
        loss += 0.5 * alpha * squared_norm(w)
        return loss

    loss = -(Y * p).sum() + 0.5 * alpha * squared_norm(w)
    return loss
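# A small check of the soft-label loss above in the binary case: when Y is a
# hard one-hot encoding, the value should coincide with the usual
# -sum(log(sigmoid(y_i * z_i))) + 0.5 * alpha * ||w||^2. The helpers
# (`safe_sparse_dot`, `squared_norm`, `log_logistic`) are given minimal dense
# stand-ins here so the sketch is self-contained; they are assumptions, not
# the originals used by the module.
import numpy as np
from scipy.special import expit

safe_sparse_dot = np.dot
squared_norm = lambda a: np.dot(a.ravel(), a.ravel())
log_logistic = lambda x: -np.logaddexp(0, -x)

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = rng.randint(0, 2, 20)            # hard labels in {0, 1}
Y = np.eye(2)[y]                     # one-hot "probability" targets
w = rng.randn(3)
alpha = 1.0

soft = logistic_loss(w, X, Y, alpha)
z = X @ w
hard = -np.sum(log_logistic((2 * y - 1) * z)) + 0.5 * alpha * squared_norm(w)
print(np.isclose(soft, hard))        # expected: True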
def _logistic_cost_grad(X, Y, w, diagA, penalise_intercept):
    ''' Calculates cost and gradient for logistic regression '''
    n = X.shape[0]
    Xw = np.dot(X, w)
    s = expit(Xw)
    wdA = w * diagA
    if not penalise_intercept:
        wdA[0] = 0
    cost = np.sum(-Xw * Y - log_logistic(-Xw)) + np.sum(w * wdA) / 2
    grad = np.dot(X.T, s - Y) + wdA
    return [cost / n, grad / n]
def pseudo_likelihood(v, weights, biases_v, biases_h):
    corruption = (np.arange(v.shape[0]),
                  np.random.randint(0, v.shape[1], v.shape[0]))
    v_copy = v.copy()
    v_copy[corruption] = 1 - v_copy[corruption]

    energy = free_energy(v, weights, biases_v, biases_h)
    energy_copy = free_energy(v_copy, weights, biases_v, biases_h)

    likelihoods = v.shape[1] * log_logistic(energy_copy - energy)
    return likelihoods.mean()
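# Usage sketch for `pseudo_likelihood` above. `free_energy` is not shown in
# this collection, so a standard binary-RBM free energy is assumed here:
# F(v) = -v.b_v - sum_j softplus(b_h_j + (v W)_j), with assumed shapes
# weights: (n_visible, n_hidden), biases_v: (n_visible,), biases_h: (n_hidden,).
import numpy as np

def log_logistic(x):
    # stable log(sigmoid(x)); stand-in for the helper used above
    return -np.logaddexp(0, -x)

def free_energy(v, weights, biases_v, biases_h):
    # free energy of a Bernoulli-Bernoulli RBM (assumed formulation)
    return -v @ biases_v - np.logaddexp(0, v @ weights + biases_h).sum(axis=1)

rng = np.random.RandomState(0)
n_samples, n_visible, n_hidden = 50, 20, 8
v = (rng.rand(n_samples, n_visible) > 0.5).astype(float)
weights = 0.01 * rng.randn(n_visible, n_hidden)
biases_v = np.zeros(n_visible)
biases_h = np.zeros(n_hidden)

# mean log pseudo-likelihood over the batch (a negative number)
print(pseudo_likelihood(v, weights, biases_v, biases_h))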
def _logistic_loss_and_grad(w, X, y, alpha, mask, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    mask : array-like, shape (n_features,) or (n_classes, n_features), optional
        Masking array for coef.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    n_samples, n_features = X.shape
    if mask is not None:
        w[:n_features] *= mask

    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(n_samples)

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) / n_samples
    out += .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y
    grad[:n_features] = (safe_sparse_dot(X.T, z0) / n_samples) + alpha * w
    if mask is not None:
        grad[:n_features] *= mask

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum() / n_samples
    return out, grad
def predict_proba(self, X):
    """Predict probabilities for samples.

    Args:
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

    Returns:
        array-like of shape (n_samples, n_classes): T.
            Returns the probability of the sample for each class in the
            model, where classes are ordered as they are in
            ``self.classes_``.
    """
    probs = np.exp(log_logistic(self.decision_function(X)))
    return np.column_stack((1 - probs, probs))
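# The pattern used in `predict_proba` above: exp(log(sigmoid(z))) is just
# sigmoid(z), and stacking (1 - p, p) yields one column per class with rows
# summing to 1. A self-contained check of that identity (stand-alone, not
# calling the method, since the surrounding estimator class is not shown):
import numpy as np
from scipy.special import expit

z = np.linspace(-5, 5, 7)                     # decision-function values
probs = np.exp(-np.logaddexp(0, -z))          # exp(log_logistic(z))
proba = np.column_stack((1 - probs, probs))   # columns: class 0, class 1

print(np.allclose(probs, expit(z)))           # True
print(np.allclose(proba.sum(axis=1), 1.0))    # True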
def _logistic_l1_loss_and_grad(w2, X, y, alpha, penalty, fit_intercept,
                               l1_ratio, sample_weight):
    n_samples, n_features = X.shape
    grad = np.empty_like(w2)
    reg_grad = np.zeros(w2.size)
    c = 0.
    if fit_intercept:
        c = w2[-1]
        w = w2[:n_features] - w2[n_features:-1]
        t = w2[:n_features] + w2[n_features:-1]
    else:
        w = w2[:n_features] - w2[n_features:]
        t = w2[:n_features] + w2[n_features:]
    z = safe_sparse_dot(X, w) + c
    yz = y * z
    if penalty == "l1":
        reg = alpha * t.sum()
        # penalize only the split coefficients, not the intercept
        reg_grad[:2 * n_features] = alpha
    elif penalty == "elasticnet":
        regl2 = 0.5 * (1 - l1_ratio) * alpha * np.dot(w, w)
        regl1 = l1_ratio * alpha * t.sum()
        reg = regl2 + regl1
        rg1 = alpha * l1_ratio
        rg2 = alpha * (1 - l1_ratio) * w
        reg_grad[:2 * n_features] = np.concatenate([rg2, -rg2]) + rg1
    out = -np.sum(sample_weight * log_logistic(yz)) + reg

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y
    g = safe_sparse_dot(X.T, z0)
    if fit_intercept:
        grad[:n_features] = g
        grad[n_features:-1] = -g
        grad[-1] = z0.sum()
    else:
        grad[:n_features] = g
        grad[n_features:] = -g
    grad += reg_grad
    return out, grad
def fgrad(we, X, y, l1, l2):
    nsamples, nfactors = X.shape

    w0 = we[0]
    w = we[1:(nfactors + 1)] - we[(nfactors + 1):]
    yz = y * (safe_sparse_dot(X, w) + w0)

    f = -np.sum(log_logistic(yz)) + l1 * np.sum(we[1:]) + 0.5 * l2 * np.dot(w, w)

    e = (expit(yz) - 1) * y
    g = safe_sparse_dot(X.T, e) + l2 * w
    g0 = np.sum(e)

    grad = np.concatenate([g, -g]) + l1
    grad = np.insert(grad, 0, g0)

    return f, grad
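# A usage sketch for `fgrad` above: because it returns (loss, gradient), it can
# be passed to scipy.optimize.minimize with jac=True. The split representation
# we = [intercept, w_plus, w_minus] only makes sense with non-negativity bounds
# on w_plus and w_minus, which is what turns l1 * sum(we[1:]) into an L1
# penalty on w = w_plus - w_minus. The helper stand-ins below are assumptions.
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit
from sklearn.datasets import make_classification

safe_sparse_dot = np.dot
log_logistic = lambda x: -np.logaddexp(0, -x)

X, y01 = make_classification(n_samples=100, n_features=4, random_state=0)
y = 2 * y01 - 1                                   # labels in {-1, +1}
l1, l2 = 0.5, 1.0

we0 = np.zeros(1 + 2 * X.shape[1])                # [intercept, w_plus, w_minus]
bounds = [(None, None)] + [(0, None)] * (2 * X.shape[1])
res = minimize(fgrad, we0, args=(X, y, l1, l2), jac=True,
               method="L-BFGS-B", bounds=bounds)

w = res.x[1:X.shape[1] + 1] - res.x[X.shape[1] + 1:]
print(res.x[0], w)                                # fitted intercept and coefficients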
def _logistic_loss(w, X, y, alpha, sample_weight=None, rho=None, q=None):
    """Computes the logistic loss.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : ndarray, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    rho : ndarray, shape (2,)
        Label-noise rates, indexed by class.
    q : ndarray, shape (n_samples,) optional
        Per-sample indicator: 0 for noisy labels, 1 for clean labels.
        If not provided, all labels are treated as noisy.

    Returns
    -------
    out : float
        Logistic loss.
    """
    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    # add noise term
    if q is None:
        q = np.zeros_like(y)
    y01 = np.array(y == 1, dtype=int)
    qnoise = np.array(q == 0, dtype=bool)
    if np.any(qnoise):
        rho_y = np.array([[rho[1 - label], rho[label]] for label in y01])
        yzq = yz[qnoise]
        wq = sample_weight[qnoise]
        out += np.sum(wq * log_noise_logistic(yzq, rho_y[qnoise, :]))
    return out
def temp_log_loss(w, X, Y, alpha):
    n_classes = Y.shape[1]
    w = w.reshape(n_classes, -1)
    intercept = w[:, -1]
    w = w[:, :-1]
    z = safe_sparse_dot(X, w.T) + intercept

    denom = expit(z)
    # print denom
    # print denom.sum()
    denom = denom.sum(axis=1).reshape((denom.shape[0], -1))
    # print denom
    p = log_logistic(z)

    loss = -(Y * p).sum()
    loss += np.log(denom).sum()
    loss += 0.5 * alpha * squared_norm(w)
    return loss
def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y
    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
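# A finite-difference check of the loss/gradient pair above, using
# scipy.optimize.check_grad. `_intercept_dot`, `log_logistic`, and
# `safe_sparse_dot` are private sklearn helpers; minimal dense stand-ins are
# defined here so the sketch runs on its own (assumptions, not the originals),
# and `_logistic_loss_and_grad` is assumed to be in scope.
import numpy as np
from scipy.optimize import check_grad
from scipy.special import expit

def _intercept_dot(w, X, y):
    # returns (coef without intercept, intercept, y * decision_function)
    c = 0.
    if w.size == X.shape[1] + 1:
        c = w[-1]
        w = w[:-1]
    return w, c, y * (np.dot(X, w) + c)

log_logistic = lambda x: -np.logaddexp(0, -x)
safe_sparse_dot = np.dot

rng = np.random.RandomState(0)
X = rng.randn(30, 4)
y = np.sign(rng.randn(30))
w = rng.randn(5)                       # 4 coefficients + intercept
alpha = 1.0

err = check_grad(lambda w: _logistic_loss_and_grad(w, X, y, alpha)[0],
                 lambda w: _logistic_loss_and_grad(w, X, y, alpha)[1],
                 w)
print(err)                             # should be small (roughly 1e-5 or less)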
def score_samples(self, X):
    """Compute the pseudo-likelihood of X.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Values of the visible layer. Must be all-boolean (not checked).

    Returns
    -------
    pseudo_likelihood : array-like, shape (n_samples,)
        Value of the pseudo-likelihood (proxy for likelihood).

    Notes
    -----
    This method is not deterministic: it computes a quantity called the
    free energy on X, then on a randomly corrupted version of X, and
    returns the log of the logistic function of the difference.
    """
    check_is_fitted(self, "components_")

    v = check_array(X, accept_sparse='csr')

    fe = self._free_energy(v)
    v_, state = self.corrupt(v)
    # TODO: If I wanted to be really fancy here, I would do one of those
    # "with..." things.
    fe_corrupted = self._free_energy(v)
    self.uncorrupt(v, state)

    # See https://en.wikipedia.org/wiki/Pseudolikelihood
    # Let x be some visible vector. x_i is the ith entry, x_-i is the vector
    # without that entry, x_iflipped is x with the ith bit flipped, and F()
    # is the free energy.
    #   P(x_i | x_-i) = P(x) / P(x_-i) = P(x) / (P(x) + p(x_iflipped))
    # Expand the definition of P(x), cancel out the partition function on
    # each term, and divide top and bottom by e^{-F(x)} to get
    #   1 / (1 + e^{F(x) - F(x_iflipped)})
    # So we're just calculating the log of that. We multiply by the number
    # of visible units because we're approximating P(x) as the product of
    # the conditional likelihoods of the individual units. But we're too
    # lazy to do each one individually, so we say the unit we tested
    # represents an average.
    if hasattr(self, 'codec'):
        normalizer = self.codec.shape()[0]
    else:
        normalizer = v.shape[1]
    return normalizer * log_logistic(fe_corrupted - fe)
def score_samples(self, X):
    """Compute the pseudo-likelihood of X.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Values of the visible layer. Must be all-boolean (not checked).

    Returns
    -------
    pseudo_likelihood : array-like, shape (n_samples,)
        Value of the pseudo-likelihood (proxy for likelihood).

    Notes
    -----
    This method is not deterministic: it computes a quantity called the
    free energy on X, then on a randomly corrupted version of X, and
    returns the log of the logistic function of the difference.
    """
    check_is_fitted(self, "components_")

    v = check_array(X, accept_sparse='csr')
    rng = check_random_state(self.random_state)

    # Randomly corrupt one feature in each sample in v.
    ind = (np.arange(v.shape[0]),
           rng.randint(0, v.shape[1], v.shape[0]))
    if issparse(v):
        data = -2 * v[ind] + 1
        v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)
    else:
        v_ = v.copy()
        v_[ind] = 1 - v_[ind]

    fe = self._free_energy(v)
    fe_ = self._free_energy(v_)
    return v.shape[1] * log_logistic(fe_ - fe)
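# The method above mirrors scikit-learn's BernoulliRBM.score_samples, so the
# public estimator can be used to see it in action on binary data (a usage
# sketch; exact values depend on the random state and the stochastic corruption).
import numpy as np
from sklearn.neural_network import BernoulliRBM

rng = np.random.RandomState(0)
X = (rng.rand(100, 16) > 0.5).astype(np.float64)   # all-boolean visible units

rbm = BernoulliRBM(n_components=8, learning_rate=0.05, n_iter=10,
                   random_state=0)
rbm.fit(X)

pl = rbm.score_samples(X)          # one pseudo-likelihood value per sample
print(pl.shape, pl.mean())         # (100,), a negative average log-proxy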
def score_samples_TAP(self, X):
    """Compute the pseudo-likelihood of X using second order TAP.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Values of the visible layer. Must be all-boolean (not checked).

    Returns
    -------
    pseudo_likelihood : array-like, shape (n_samples,)
        Value of the pseudo-likelihood (proxy for likelihood).

    Notes
    -----
    This method is not deterministic: it computes the TAP Free Energy on X,
    then on a randomly corrupted version of X, and returns the log of the
    logistic function of the difference.
    """
    check_is_fitted(self, "W")

    v = check_array(X, accept_sparse='csr')
    v, v_ = self._corrupt_data(v)

    fe = self._free_energy_TAP(v)
    fe_ = self._free_energy_TAP(v_)
    return v.shape[1] * log_logistic(fe_ - fe)
def _logistic_loss_and_grad(w, alpha, X, y, lamda, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_kernels,)
        Kernel-weight vector, shared across patients.
    alpha : list of ndarray
        Per-patient coefficient vectors, each of shape (n_features,) or
        (n_features + 1,) when an intercept is included.
    X : list of arrays, each of shape (n_kernels, n_samples, n_features)
        Training data, one array per patient.
    y : list of ndarray, each of shape (n_samples,)
        Arrays of labels, one per patient.
    lamda : float
        Regularization parameter on w.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_kernels,)
        Logistic gradient with respect to w.
    """
    n_patients = len(X)
    out = 0.
    grad = np.zeros_like(w)
    sample_weight_orig = sample_weight.copy() if sample_weight is not None \
        else None
    for i in range(n_patients):
        n_kernels, n_samples, n_features = X[i].shape
        x_i = np.tensordot(w, X[i], axes=1)
        alpha_i, c, yz = _intercept_dot(alpha[i], x_i, y[i])

        if sample_weight_orig is None:
            sample_weight = np.ones(n_samples)

        # Logistic loss is the negative of the log of the logistic function.
        out += -np.sum(sample_weight * log_logistic(yz))

        z = expit(yz)
        z0 = sample_weight * (z - 1) * y[i]
        grad += safe_sparse_dot(X[i].dot(alpha_i), z0)

        # alpha_i, c_i, x_i = _intercept_dot(alpha[i][:-1], X[i], 1.)
        # out_i, grad_i = _loglossgrad(
        #     np.append(w, alpha[i][-1]), x_i.T, y[i], 0,
        #     sample_weight=sample_weight)
        # out += out_i
        # grad += grad_i[:n_kernels]

    out += .5 * lamda * np.dot(w, w)
    grad += lamda * w
    return out, grad
def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None, rho=None, q=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : ndarray, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    rho : ndarray, shape (2,)
        Label-noise rates, indexed by class.
    q : ndarray, shape (n_samples,) optional
        Per-sample indicator: 0 for noisy labels, 1 for clean labels.
        If not provided, all labels are treated as noisy.

    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    z = expit(yz)

    # add noise term
    if q is None:
        q = np.zeros_like(y)
    y01 = np.array(y == 1, dtype=int)
    qnoise = np.array(q == 0, dtype=bool)
    if np.any(qnoise):
        rho_y = np.array([[rho[1 - label], rho[label]] for label in y01])
        z += expit_noise(yz, qnoise, rho_y)
        yzq = yz[qnoise]
        wq = sample_weight[qnoise]
        out += np.sum(wq * log_noise_logistic(yzq, rho_y[qnoise, :]))

    z0 = sample_weight * (z - 1) * y
    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
def loss(self, y, pred):
    return (-log_logistic(y * pred))[0][0]
def _l1_logistic_loss_grad(w_extended, X, y, C, D, k, idx, ignore2w):
    # print(k)
    _, n_features = X.shape
    w = w_extended[:n_features] - w_extended[n_features:]
    # w[regularized_alphas] = 0.

    yz = y * safe_sparse_dot(X, w)

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(log_logistic(yz))
    # out += .5 * alpha * np.dot(w, w)  # L2

    w_extended_ = copy.copy(w_extended)

    # don't regularize \alphas
    if ignore2w == 0:
        reg_idx = list(range(1, idx))
        reg_idx2 = list(range(n_features + 1, n_features + idx))
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.
    # model_user_event
    elif ignore2w == 1:
        reg_idx = list(range(idx, n_features))
        reg_idx2 = list(range(n_features + idx, 2 * n_features))
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.
    # model_user_event_fb
    else:
        reg_idx = list(range(1, idx)) + list(range(idx, n_features))
        reg_idx2 = list(range(n_features, n_features + idx)) + list(
            range(n_features + idx, 2 * n_features))
        # if len(regularized_alphas) > 0:
        #     unpenalized_idx = list(set(list(range(0, idx - 1))) - set(regularized_alphas)) + list(
        #         set(list(range(idx, n_features))) - set(idx + np.array(regularized_alphas)))
        #     penalized_idx = list(regularized_alphas) + list(idx + np.array(regularized_alphas))
        #     w_extended_[penalized_idx] = w_extended_[penalized_idx] * 1000000
        #     w_extended_[unpenalized_idx] = 0.
        # else:
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.

    if ignore2w > 0:
        w_ = w[idx:]
        w_ = np.transpose(w_.flatten().reshape(k, -1))
        # print(w_.shape)
        Dsmooth = w_[1:, :] - w_[:-1, :]
        zero = np.zeros((1, k))
        Dsmooth = np.concatenate((Dsmooth, zero), axis=0)
        Dsmooth = np.transpose(Dsmooth)
        Dsmooth = Dsmooth.flatten()
        Dsmooth_squared = Dsmooth * Dsmooth

        # out += alpha * w_extended.sum()
        out += C * w_extended_.sum() + 0.5 * D * Dsmooth_squared.sum()  # L1, w_extended is non-negative

        z = expit(yz)
        z0 = (z - 1) * y
        grad = safe_sparse_dot(X.T, z0)
        grad = np.concatenate([grad, -grad])
        # grad += alpha * w  # L2
        # grad += alpha  # L1

        D_grad = np.zeros((n_features, ))
        D_grad[idx:] = Dsmooth
        D_grad = np.concatenate([D_grad, -D_grad])

        grad += C - D * D_grad
    else:
        out += C * w_extended.sum()  # L1 penalty; w_extended is non-negative

        z = expit(yz)
        z0 = (z - 1) * y
        grad = safe_sparse_dot(X.T, z0)
        grad = np.concatenate([grad, -grad])
        grad += C

    return out, grad
def _l1_logistic_loss_grad(w_extended, X, y, C, D, k, idx, ignore2w):
    # print(k)
    _, n_features = X.shape
    w = w_extended[:n_features] - w_extended[n_features:]

    yz = y * safe_sparse_dot(X, w)

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(log_logistic(yz))
    # out += .5 * alpha * np.dot(w, w)  # L2

    w_extended_ = copy.copy(w_extended)

    if ignore2w == 0:
        reg_idx = list(range(0, idx - 1))
        reg_idx2 = list(range(n_features, n_features + idx - 1))
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.
    elif ignore2w == 1:
        reg_idx = list(range(idx, n_features))
        reg_idx2 = list(range(n_features + idx, 2 * n_features))
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.
    else:
        n_user = int((idx - 1) / 2)
        reg_idx = list(range(0, n_user)) + list(range(idx, n_features))
        reg_idx2 = list(range(n_features, n_features + n_user)) + list(
            range(n_features + idx, 2 * n_features))
        # reg_idx = list(range(0, idx - 1)) + list(range(idx, n_features))
        # reg_idx2 = list(range(n_features, n_features + idx - 1)) + list(
        #     range(n_features + idx, 2 * n_features))
        w_extended_[reg_idx] = 0.
        w_extended_[reg_idx2] = 0.

    if ignore2w > 0:
        w_ = w[idx:]
        w_ = np.transpose(w_.flatten().reshape(k, -1))
        # print(w_.shape)
        Dsmooth = w_[1:, :] - w_[:-1, :]
        zero = np.zeros((1, k))
        Dsmooth = np.concatenate((Dsmooth, zero), axis=0)
        Dsmooth = np.transpose(Dsmooth)
        Dsmooth = Dsmooth.flatten()
        Dsmooth_squared = Dsmooth * Dsmooth

        # out += alpha * w_extended.sum()
        out += C * w_extended_.sum() + D * Dsmooth_squared.sum()  # L1, w_extended is non-negative

        z = expit(yz)
        z0 = (z - 1) * y
        grad = safe_sparse_dot(X.T, z0)
        grad = np.concatenate([grad, -grad])
        # grad += alpha * w  # L2
        # grad += alpha  # L1

        D_grad = np.zeros((n_features, ))
        D_grad[idx:] = Dsmooth
        D_grad = np.concatenate([D_grad, -D_grad])

        grad += C - 2 * D * D_grad
    else:
        out += C * w_extended.sum()  # L1 penalty; w_extended is non-negative

        z = expit(yz)
        z0 = (z - 1) * y
        grad = safe_sparse_dot(X.T, z0)
        grad = np.concatenate([grad, -grad])
        grad += C

    return out, grad