import numpy as np
import dask.array as da

# `exp` and `compute_stepsize` are assumed to come from the surrounding
# module (they are not defined in this snippet).
def gradient_descent(X, y, max_steps=100, tol=1e-14):
    '''Michael Grant's implementation of Gradient Descent.'''
    n, p = X.shape
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(p)
    y_local = y.compute()

    for k in range(max_steps):
        # how necessary is this recalculation?
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            eXbeta = da.exp(Xbeta)
            func = da.log1p(eXbeta).sum() - y.dot(Xbeta)

        e1 = eXbeta + 1.0
        gradient = X.T.dot(eXbeta / e1 - y)
        Xgradient = X.dot(gradient)

        Xbeta, eXbeta, func, gradient, Xgradient = da.compute(
            Xbeta, eXbeta, func, gradient, Xgradient)

        # backtracking line search
        lf = func
        stepSize, beta, Xbeta, func = compute_stepsize(
            beta, gradient, Xbeta, Xgradient, y_local, func,
            **{'backtrackMult': backtrackMult,
               'armijoMult': armijoMult,
               'stepSize': stepSize})

        if stepSize == 0:
            print('No more progress')
            break

        # necessary for gradient computation
        eXbeta = exp(Xbeta)

        df = lf - func
        df /= max(func, lf)
        if df < tol:
            print('Converged')
            break
        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    return beta
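# `compute_stepsize` is not shown in this snippet. As a rough, hypothetical
# sketch of the backtracking Armijo search it is assumed to perform for the
# logistic objective (pure numpy, local arrays only; the name
# `compute_stepsize_sketch` is ours, not the codebase's):
import numpy as np

def compute_stepsize_sketch(beta, gradient, Xbeta, Xgradient, y, func,
                            backtrackMult=0.1, armijoMult=0.1, stepSize=1.0):
    steplen = (gradient ** 2).sum()
    for _ in range(100):  # cap the number of backtracking steps
        beta_new = beta - stepSize * gradient
        # reuse X.dot(beta) and X.dot(gradient) instead of re-multiplying by X
        Xbeta_new = Xbeta - stepSize * Xgradient
        func_new = np.log1p(np.exp(Xbeta_new)).sum() - y.dot(Xbeta_new)
        # Armijo sufficient-decrease test: f(beta - t*g) <= f(beta) - c*t*||g||^2
        if func_new <= func - armijoMult * stepSize * steplen:
            return stepSize, beta_new, Xbeta_new, func_new
        stepSize *= backtrackMult
    return 0, beta, Xbeta, func  # signals 'No more progress' to the caller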
def loglikelihood(self, Xbeta, y):
    """
    Evaluate the logistic loglikelihood.

    Uses the rearrangement ``log(1 + e^t) = t + log(1 + e^{-t})``, which
    avoids overflow in ``exp`` for large positive ``Xbeta``.

    Parameters
    ----------
    Xbeta : array, shape (n_samples,)
    y : array, shape (n_samples,)
    """
    enXbeta = exp(-Xbeta)
    return (Xbeta + log1p(enXbeta)).sum() - dot(y, Xbeta)
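# Why the rearranged form matters: for large positive t, log(1 + e^t)
# overflows inside exp, while t + log1p(e^{-t}) stays finite. A quick
# numpy illustration of the identity used above:
import numpy as np

t = np.array([0.0, 50.0, 800.0])
naive = np.log1p(np.exp(t))        # inf at t=800 (e^800 overflows, numpy warns)
stable = t + np.log1p(np.exp(-t))  # [0.693..., 50.0, 800.0]
# np.logaddexp(0, t) computes the same quantity stably for both signs of t.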
def predict(self, X):
    """Predict count for samples in X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    C : array, shape = [n_samples,]
        Predicted count for each sample.
    """
    X_ = self._check_array(X)
    return exp(dot(X_, self._coef))
def make_poisson(n_samples=1000, n_features=100, n_informative=2, scale=1.0,
                 chunksize=100, is_sparse=False):
    """
    Generate a dummy dataset for modeling count data.

    Parameters
    ----------
    n_samples : int
        number of rows in the output array
    n_features : int
        number of columns (features) in the output array
    n_informative : int
        number of features that are correlated with the outcome
    scale : float
        Scale the true coefficient array by this
    chunksize : int
        Number of rows per dask array block.
    is_sparse : bool
        Return a sparse matrix

    Returns
    -------
    X : dask.array, size ``(n_samples, n_features)``
    y : dask.array, size ``(n_samples,)``
        array of non-negative integer-valued data

    Examples
    --------
    >>> X, y = make_poisson()
    >>> X
    dask.array<da.random.normal, shape=(1000, 100), dtype=float64, chunksize=(100, 100)>
    >>> y
    dask.array<da.random.poisson, shape=(1000,), dtype=int64, chunksize=(100,)>
    """
    X = da.random.normal(0, 1, size=(n_samples, n_features),
                         chunks=(chunksize, n_features))
    if is_sparse:
        X = X.map_blocks(sparse.COO)
    informative_idx = np.random.choice(n_features, n_informative)
    beta = (np.random.random(n_features) - 1) * scale
    z0 = X[:, informative_idx].dot(beta[informative_idx])
    rate = exp(z0)
    y = da.random.poisson(rate, size=1, chunks=(chunksize,))
    return X, y
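# Example usage sketch for make_poisson, following the docstring above
# (assumes numpy as np and dask.array as da, as in the function body):
X, y = make_poisson(n_samples=1000, n_features=100, chunksize=100)
# X: dask array, shape (1000, 100), float64, in blocks of 100 rows
# y: dask array, shape (1000,), non-negative integer counts whose mean
#    exp(z0) is driven by the n_informative selected columns
X_local, y_local = da.compute(X, y)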
def make_poisson(n_samples=1000, n_features=100, n_informative=2, scale=1.0,
                 chunksize=100):
    X = da.random.normal(0, 1, size=(n_samples, n_features),
                         chunks=(chunksize, n_features))
    informative_idx = np.random.choice(n_features, n_informative)
    beta = (np.random.random(n_features) - 1) * scale
    z0 = X[:, informative_idx].dot(beta[informative_idx])
    rate = exp(z0)
    y = da.random.poisson(rate, size=1, chunks=(chunksize,))
    return X, y
def gradient(self, Xbeta, X, y):
    # Gradient of the Poisson negative loglikelihood: X^T (e^{Xbeta} - y).
    eXbeta = exp(Xbeta)
    return dot(X.T, eXbeta - y)
def loglikelihood(self, Xbeta, y):
    # Poisson negative loglikelihood, up to the constant sum(log(y!)):
    #   sum(e^{Xbeta} - y * Xbeta)
    eXbeta = exp(Xbeta)
    yXbeta = y * Xbeta
    return (eXbeta - yXbeta).sum()
def hessian(self, Xbeta, X):
    # Hessian of the Poisson negative loglikelihood: X^T diag(e^{Xbeta}) X.
    eXbeta = exp(Xbeta)
    x_diag_eXbeta = eXbeta[:, None] * X
    return dot(X.T, x_diag_eXbeta)
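# A quick numpy cross-check of the Poisson gradient and hessian above
# against finite differences of the loglikelihood (local variables only;
# this sketch re-inlines the three methods with np.exp/np.dot):
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
y = rng.poisson(1.0, size=50)
beta = rng.normal(size=3)

f = lambda b: (np.exp(X.dot(b)) - y * X.dot(b)).sum()     # loglikelihood
g = X.T.dot(np.exp(X.dot(beta)) - y)                      # gradient
H = X.T.dot(np.exp(X.dot(beta))[:, None] * X)             # hessian

eps = 1e-6
g_fd = np.array([(f(beta + eps * np.eye(3)[i]) - f(beta - eps * np.eye(3)[i]))
                 / (2 * eps) for i in range(3)])
assert np.allclose(g, g_fd, rtol=1e-4)   # gradient matches finite differences
assert np.allclose(H, H.T)               # X^T diag(e^{Xbeta}) X is symmetric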
import numpy as np

# `exp`, `log1p`, `dot`, `compute_stepsize_dask`, and `Logistic` are assumed
# to come from the surrounding module; `persist` and `compute` are the
# top-level dask functions.
def bfgs(X, y, max_iter=500, tol=1e-14, family=Logistic):
    '''Simple implementation of BFGS.'''
    n, p = X.shape
    y = y.squeeze()

    recalcRate = 10
    stepSize = 1.0
    armijoMult = 1e-4
    backtrackMult = 0.5
    stepGrowth = 1.25

    beta = np.zeros(p)
    Hk = np.eye(p)
    for k in range(max_iter):
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            eXbeta = exp(Xbeta)
            func = log1p(eXbeta).sum() - dot(y, Xbeta)

        e1 = eXbeta + 1.0
        gradient = dot(X.T, eXbeta / e1 - y)  # implicit numpy -> dask conversion

        if k:
            yk = yk + gradient  # TODO: gradient is dasky and yk is numpy-y
            rhok = 1 / yk.dot(sk)
            # rank-two BFGS update of the inverse Hessian approximation
            # (np.outer, not dot: sk and yk are 1-D arrays)
            adj = np.eye(p) - rhok * np.outer(sk, yk)
            Hk = dot(adj, dot(Hk, adj.T)) + rhok * np.outer(sk, sk)

        step = dot(Hk, gradient)
        steplen = dot(step, gradient)
        Xstep = dot(X, step)

        # backtracking line search
        lf = func
        old_Xbeta = Xbeta
        stepSize, _, _, func = compute_stepsize_dask(
            beta, step, Xbeta, Xstep, y, func, family=family,
            backtrackMult=backtrackMult, armijoMult=armijoMult,
            stepSize=stepSize)

        beta, stepSize, Xbeta, gradient, lf, func, step, Xstep = persist(
            beta, stepSize, Xbeta, gradient, lf, func, step, Xstep)
        stepSize, lf, func, step = compute(stepSize, lf, func, step)

        beta = beta - stepSize * step
        # tiny bit of repeat work here to avoid communication
        Xbeta = Xbeta - stepSize * Xstep

        if stepSize == 0:
            print('No more progress')
            break

        # necessary for gradient computation
        eXbeta = exp(Xbeta)

        yk = -gradient
        sk = -stepSize * step
        stepSize *= stepGrowth

        if stepSize == 0:
            print('No more progress')
            break

        df = lf - func
        df /= max(func, lf)
        if df < tol:
            print('Converged')
            break

    return beta
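# The Hk recursion above is the standard BFGS inverse-Hessian update
#   H+ = (I - rho s y^T) H (I - rho y s^T) + rho s s^T,  rho = 1 / (y^T s),
# chosen so that the secant condition H+ y = s holds exactly. A small,
# self-contained numpy check (local names, unrelated to the dask code):
import numpy as np

p = 4
rng = np.random.RandomState(1)
H = np.eye(p)
s = rng.normal(size=p)             # step taken:      beta_new - beta_old
yv = s + 0.1 * rng.normal(size=p)  # gradient change, with y^T s > 0
rho = 1.0 / yv.dot(s)
adj = np.eye(p) - rho * np.outer(s, yv)
H_new = adj.dot(H).dot(adj.T) + rho * np.outer(s, s)
assert np.allclose(H_new.dot(yv), s)  # secant condition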
def loglike(Xbeta, y):
    # Poisson negative loglikelihood (same formula as `loglikelihood` above).
    eXbeta = exp(Xbeta)
    yXbeta = y * Xbeta
    return (eXbeta - yXbeta).sum()
def loglike(Xbeta, y):
    # Logistic negative loglikelihood: sum(log(1 + e^{Xbeta})) - y^T Xbeta.
    eXbeta = exp(Xbeta)
    return log1p(eXbeta).sum() - dot(y, Xbeta)
def sigmoid(x):
    return 1 / (1 + exp(-x))
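# exp(-x) overflows (numpy warns) for very negative x, even though the
# result still saturates correctly at 0. A numerically quiet variant as a
# sketch; scipy.special.expit is the library version of the same function:
import numpy as np

def sigmoid_stable(x):
    out = np.empty_like(x, dtype=float)
    pos = x >= 0
    out[pos] = 1 / (1 + np.exp(-x[pos]))
    ex = np.exp(x[~pos])       # safe: x < 0, so exp(x) <= 1
    out[~pos] = ex / (1 + ex)  # algebraically equal to 1 / (1 + exp(-x))
    return out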