import numpy as np
from autograd import grad

# ExactRep (exactly reversible fixed-point arithmetic) and fill_parser
# (which broadcasts one scalar per weight type onto the full parameter
# vector) are assumed to be provided by the surrounding codebase.


def sgd_parsed_mad(L_grad, hypers, parser, callback=None, forward_pass_only=True):
    """SGD with momentum, with an approximate reverse pass: instead of
    exactly undoing each update, the backward loop reconstructs the iterates
    by linearly interpolating between x0 and x_final. alphas and gammas are
    T x N_weight_types matrices; parser holds the indices of the different
    weight types."""
    x0, alphas, gammas, meta = hypers
    N_safe_sampling = len(alphas)
    x_init = np.copy(x0)
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = list(zip(range(len(alphas)), alphas, gammas))
    for i, alpha, gamma in iters:
        g = L_grad(X.val, meta, i)
        if callback:
            callback(X.val, V.val, g, i)
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_gamma_vect = fill_parser(parser, gamma)
        V.mul(cur_gamma_vect).sub((1.0 - cur_gamma_vect) * g)  # v <- gamma*v - (1-gamma)*g
        X.add(cur_alpha_vect * V.val)                          # x <- x + alpha*v
    x_final = X.val
    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_gammas = np.zeros(alphas.shape), np.zeros(gammas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        # Interpolation weights used to approximate the forward iterates.
        beta = np.linspace(0.001, 0.999, N_safe_sampling)
        for i, alpha, gamma in iters[::-1]:
            # Build alpha and gamma vectors.
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_gamma_vect = fill_parser(parser, gamma)
            # Approximate the iterate and its predecessor by interpolation.
            # (Note: at i == 0, beta[i - 1] wraps around to beta[-1].)
            x = (1 - beta[i]) * x_init + beta[i] * x_final
            x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
            v = (x - x_previous) / cur_alpha_vect  # Recover velocity.
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.items()):
                d_alphas[i, j] = np.dot(d_x[ixs], v[ixs])
            g = L_grad(x, meta, i)  # Evaluate gradient.
            d_v += d_x * cur_alpha_vect
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.items()):
                d_gammas[i, j] = np.dot(d_v[ixs], v[ixs] + g[ixs])
            d_x -= L_hvp_x(x, meta, (1.0 - cur_gamma_vect) * d_v, i)
            d_meta -= L_hvp_meta(x, meta, (1.0 - cur_gamma_vect) * d_v, i)
            d_v *= cur_gamma_vect
        # The exact-reversal check is disabled here: the interpolated
        # trajectory does not recover x0 bit-for-bit.
        # assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_gammas, d_meta

    return x_final, [None, hypergrad]
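# A hedged usage sketch for sgd_parsed_mad, not part of the original source.
# It assumes `parser`, `train_grad(x, meta, i)`, `x0`, and `meta` come from
# the surrounding codebase, with `parser` being the WeightsParser-style
# object that fill_parser and hypergrad index into. The calling convention
# mirrors sgd_parsed below; the difference is that this reverse pass
# approximates the iterates by interpolation instead of exact reversal.
def mad_hypergrad_example(parser, train_grad, x0, meta, outgrad, T=100, n_types=1):
    alphas = np.full((T, n_types), 0.1)  # per-step, per-weight-type learning rates
    gammas = np.full((T, n_types), 0.9)  # per-step, per-weight-type momenta
    x_final, (_, hg) = sgd_parsed_mad(train_grad, (x0, alphas, gammas, meta),
                                      parser, forward_pass_only=False)
    return hg(outgrad)  # -> d_x, d_alphas, d_gammas, d_meta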
def sgd_parsed(L_grad, hypers, parser, callback=None, forward_pass_only=True):
    """SGD with momentum, exactly reversed on the backward pass via ExactRep.
    alphas and betas are T x N_weight_types matrices; parser holds the
    indices of the different weight types."""
    x0, alphas, betas, meta = hypers
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = list(zip(range(len(alphas)), alphas, betas))
    for i, alpha, beta in iters:
        g = L_grad(X.val, meta, i)
        if callback:
            callback(X.val, V.val, g, i)
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_beta_vect = fill_parser(parser, beta)
        V.mul(cur_beta_vect).sub((1.0 - cur_beta_vect) * g)  # v <- beta*v - (1-beta)*g
        X.add(cur_alpha_vect * V.val)                        # x <- x + alpha*v
    x_final = X.val
    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)     # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:
            # Build alpha and beta vectors.
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.items()):
                d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])
            X.sub(cur_alpha_vect * V.val)  # Reverse position update.
            g = L_grad(X.val, meta, i)     # Evaluate gradient.
            V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)  # Reverse momentum update.
            d_v += d_x * cur_alpha_vect
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.items()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])
            d_x -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_v *= cur_beta_vect
        # Exact reversal must land back on the initial weights.
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
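# A hedged usage sketch for sgd_parsed, not part of the original source.
# Assumptions: `parser` is the codebase's WeightsParser-style object,
# `train_grad(x, meta, i)` returns the training-loss gradient at iteration i,
# and `valid_grad(x)` is the gradient of the validation objective whose
# hypergradient we want.
def exact_hypergrad_example(parser, train_grad, valid_grad, x0, meta,
                            T=100, n_types=1):
    alphas = np.full((T, n_types), 0.1)
    betas = np.full((T, n_types), 0.9)
    # forward_pass_only=False also returns the reverse-mode closure.
    x_final, (_, hg) = sgd_parsed(train_grad, (x0, alphas, betas, meta),
                                  parser, forward_pass_only=False)
    # Seed the reverse pass with dL_valid/dx_final to get gradients w.r.t.
    # the initial weights, the two schedules, and the meta-parameters.
    d_x0, d_alphas, d_betas, d_meta = hg(valid_grad(x_final))
    return d_x0, d_alphas, d_betas, d_meta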