def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    Build the symbolic weight-gradient estimate for one step of two spike signals.

    Decaying traces of ``xs`` and ``es`` are kept in shared variables, together
    with the outer product of those traces as recorded on the last step at
    which each (input, output) pair saw a spike.  For every pair that spikes on
    the current step, the difference between the decayed trace product and the
    recorded one, divided by ``rx*re - 1``, is emitted.  All state changes are
    registered through ``add_update``.

    :param xs: Input signal, combined with an (n_samples, n_in) trace; nonzero entries count as spikes.
    :param es: Error signal, combined with an (n_samples, n_out) trace; nonzero entries count as spikes.
    :param kp_x: With kd_x, gives the x-trace decay ``kd_x/(kp_x+kd_x)`` and the input scale ``1/(kp_x+kd_x)``.
    :param kd_x: See kp_x.
    :param kp_e: With kd_e, gives the e-trace decay ``kd_e/(kp_e+kd_e)`` and the input scale ``1/(kp_e+kd_e)``.
    :param kd_e: See kp_e.
    :param shapes: (n_samples, n_in, n_out)
    :param epsilon: Not used by this implementation.
    :return: The emitted values summed over the first (sample) axis.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes

    # Trace product recorded at the most recent spike of each (input, output) pair.
    stored_product = create_shared_variable(np.zeros((n_samples, n_in, n_out)))

    # Per-step decay factors of the two traces.
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    # A pair is active when its input unit or its output unit is nonzero.
    pair_spiked = tt.bitwise_or(tt.neq(xs, 0)[:, :, None],
                                tt.neq(es, 0)[:, None, :])

    xr_decayed = xr * rx
    er_decayed = er * re
    decayed_product = xr_decayed[:, :, None] * er_decayed[:, None, :]
    dws = (pair_spiked * (decayed_product - stored_product)) / (rx * re - 1)

    # Fold the current inputs into the traces, then refresh the recorded
    # product only for the pairs that were active on this step.
    new_xr = xr_decayed + xs / (kp_x + kd_x)
    new_er = er_decayed + es / (kp_e + kd_e)
    refreshed_product = new_xr[:, :, None] * new_er[:, None, :]
    add_update(stored_product,
               tt.switch(pair_spiked, refreshed_product, stored_product))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def armijo(alpha0, alpha1, phi_a0, phi_a1):
    """
    One scan step of the interpolating step-size search.

    From the two previous trial steps (`alpha0`, `alpha1`) and their function
    values, computes the coefficients `a` and `b` and from them a new trial
    step `alpha2`, then evaluates `phi` there.  `phi0`, `derphi0`, `c1`, `phi`,
    `constant` and `one` are read from the enclosing scope.

    The loop is asked to stop when `phi(alpha2) <= phi0 + c1*alpha2*derphi0`,
    or when `alpha2` is NaN or infinite.  That test uses `alpha2` as first
    computed; afterwards `alpha2` is replaced by `alpha1/2` whenever
    `alpha1 - alpha2 > alpha1/2` or `1 - alpha2/alpha1 < 0.96`.

    Returns the next state `[alpha1, alpha2, phi_a1, phi_a2]` and the scan
    `until` condition.
    """
    # Deviation of phi at each trial step from the straight line
    # phi0 + derphi0 * alpha.
    resid0 = phi_a0 - phi0 - derphi0 * alpha0
    resid1 = phi_a1 - phi0 - derphi0 * alpha1

    factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
    a = (alpha0 ** 2 * resid1 - alpha1 ** 2 * resid0) / factor
    b = (-alpha0 ** 3 * resid1 + alpha1 ** 3 * resid0) / factor

    alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
    phi_a2 = phi(alpha2)

    # Stop on sufficient decrease, or when the new step is not a finite number.
    decreased_enough = phi_a2 <= phi0 + c1 * alpha2 * derphi0
    end_condition = TT.bitwise_or(TT.isnan(alpha2), decreased_enough)
    end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition)

    # Safeguard applied after the stopping test: fall back to halving alpha1.
    big_drop = alpha1 - alpha2 > alpha1 / constant(2.)
    ratio_check = one - alpha2 / alpha1 < 0.96
    alpha2 = TT.switch(TT.bitwise_or(big_drop, ratio_check),
                       alpha1 / constant(2.),
                       alpha2)

    next_state = [alpha1, alpha2, phi_a1, phi_a2]
    return next_state, theano.scan_module.until(end_condition)
def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                 alpha_star, phi_star, derphi_star):
    """
    One scan step of the step-size search: test `alpha1`, otherwise double it.

    Three tests are evaluated at `alpha1` (`phi0`, `derphi0`, `c1`, `c2`,
    `phi`, `derphi` and `profile` come from the enclosing scope):

    * cond1: `phi_a1 > phi0 + c1*alpha1*derphi0`, or
      `phi_a1 >= phi_a0` while `i_t > 0`;
    * cond2: `|derphi(alpha1)| <= -c2*derphi0`;
    * cond3: `derphi(alpha1) >= 0`.

    The candidate result is chosen in that priority order: `_zoom` between
    `alpha0` and `alpha1` for cond1, the point `alpha1` itself for cond2,
    `_zoom` between `alpha1` and `alpha0` for cond3, and otherwise the doubled
    step with a NaN derivative.  The incoming `alpha_star`, `phi_star` and
    `derphi_star` are not read; they are replaced by the new candidate.

    Returns the next scan state and an `until` condition that fires when the
    doubled step equals zero or any of the three tests holds.
    """
    derphi_a1 = derphi(alpha1)

    cond1 = TT.bitwise_or(
        phi_a1 > phi0 + c1 * alpha1 * derphi0,
        TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
    cond2 = abs(derphi_a1) <= -c2 * derphi0
    cond3 = derphi_a1 >= zero

    # Both zoom graphs are always built; the ifelse chain below picks one.
    a_fwd, phi_fwd, dphi_fwd = _zoom(
        alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
        phi, derphi, phi0, derphi0, c1, c2,
        profile=profile)
    a_bwd, phi_bwd, dphi_bwd = _zoom(
        alpha1, alpha0, phi_a1, phi_a0, derphi_a1,
        phi, derphi, phi0, derphi0, c1, c2,
        profile=profile)

    # Step tried next if none of the tests holds.
    nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
    nw_phi = phi(nw_alpha1)

    # Build the selection from the innermost alternative outwards.
    pick_c3 = ifelse(cond3,
                     (a_bwd, phi_bwd, dphi_bwd),
                     (nw_alpha1, nw_phi, nan),
                     name='alphastar_c3')
    pick_c2 = ifelse(cond2,
                     (alpha1, phi_a1, derphi_a1),
                     pick_c3,
                     name='alphastar_c2')
    alpha_star, phi_star, derphi_star = ifelse(cond1,
                                               (a_fwd, phi_fwd, dphi_fwd),
                                               pick_c2,
                                               name='alphastar_c1')

    next_phi = ifelse(lazy_or('allconds', cond1, cond2, cond3),
                      phi_a1, nw_phi, name='nwphi1')
    next_derphi = ifelse(cond1, derphi_a0, derphi_a1, name='derphi')
    next_state = [alpha1, nw_alpha1, phi_a1, next_phi, next_derphi,
                  i_t + one, alpha_star, phi_star, derphi_star]

    finished = lazy_or('until_cond_',
                       TT.eq(nw_alpha1, zero), cond1, cond2, cond3)
    return (next_state, theano.scan_module.scan_utils.until(finished))
def armijo(alpha0, alpha1, phi_a0, phi_a1):
    """
    One scan step of the interpolating step-size search.

    Takes the two most recent trial steps and their function values, derives
    the coefficients `a` and `b`, and from them the next trial step `alpha2`,
    where `phi` is then evaluated.  `phi0`, `derphi0`, `c1`, `phi`, `constant`
    and `one` are taken from the enclosing scope.

    The stop condition is `phi(alpha2) <= phi0 + c1*alpha2*derphi0`, or
    `alpha2` being NaN or infinite; it is evaluated on `alpha2` before the
    safeguard.  The safeguard then swaps `alpha2` for `alpha1/2` when
    `alpha1 - alpha2 > alpha1/2` or `1 - alpha2/alpha1 < 0.96`.

    Returns `[alpha1, alpha2, phi_a1, phi_a2]` plus the scan `until` condition.
    """
    # How far phi lies from the line phi0 + derphi0 * alpha at each step.
    gap_at_0 = phi_a0 - phi0 - derphi0 * alpha0
    gap_at_1 = phi_a1 - phi0 - derphi0 * alpha1

    factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
    a = (alpha0 ** 2 * gap_at_1 - alpha1 ** 2 * gap_at_0) / factor
    b = (-alpha0 ** 3 * gap_at_1 + alpha1 ** 3 * gap_at_0) / factor

    discriminant = abs(b ** 2 - 3 * a * derphi0)
    alpha2 = (-b + TT.sqrt(discriminant)) / (3.0 * a)
    phi_a2 = phi(alpha2)

    # Stopping test, evaluated on the un-safeguarded alpha2.
    end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
    for is_bad in (TT.isnan, TT.isinf):
        end_condition = TT.bitwise_or(is_bad(alpha2), end_condition)

    # Safeguard: replace alpha2 by half of alpha1 when either check trips.
    needs_halving = TT.bitwise_or(
        alpha1 - alpha2 > alpha1 / constant(2.),
        one - alpha2 / alpha1 < 0.96)
    alpha2 = TT.switch(needs_halving, alpha1 / constant(2.), alpha2)

    return ([alpha1, alpha2, phi_a1, phi_a2],
            theano.scan_module.until(end_condition))
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Compute the per-step weight update implied by two spike trains.

    The result is meant to match what an efficient, event-driven update would
    produce, but the computation here is done densely over every
    (sample, input, output) triple.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes

    # Per-step decay factors of the two traces.
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    in_shape = (n_samples, n_in)
    out_shape = (n_samples, n_out)
    tx_last = create_shared_variable(np.zeros(in_shape))
    te_last = create_shared_variable(np.zeros(out_shape))
    xr = create_shared_variable(np.zeros(in_shape))
    er = create_shared_variable(np.zeros(out_shape))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)

    # tx_last / te_last are reset to 0 on a spike and decremented otherwise
    # (see the updates below), so the larger one belongs to the later spike.
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])

    # Wasteful, since most of this is multiplied by zeros later, but for now
    # it doesn't matter.
    sum_to_last = geoseries_sum(rx * re, t_start=t_last, t_end=0)

    # A pair contributes when its input unit or its output unit is nonzero.
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])

    # PROBLEM HERE!!!! This can be a very small number times a very large
    # number.
    dw_es = (xr[:, :, None] * er[:, None, :] * spikes) * sum_to_last
    # Alternative formulation, left disabled:
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last

    add_update(xr, xr * rx + xs / (kp_x + kd_x))
    add_update(er, er * re + es / (kp_e + kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last - 1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last - 1))
    return dw_es.sum(axis=0)
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    Build the symbolic weight-gradient estimate for one step of two spike signals.

    Two decaying traces (one fed by ``xs``, one by ``es``) live in shared
    variables, as does the outer product of those traces recorded the last
    time each (input, output) pair saw a spike.  On a step where a pair
    spikes, the decayed trace product minus the recorded product, divided by
    ``rx*re - 1``, is emitted for that pair.  State changes are registered
    with ``add_update``.

    :param xs: Input signal, combined with an (n_samples, n_in) trace; nonzero entries count as spikes.
    :param es: Error signal, combined with an (n_samples, n_out) trace; nonzero entries count as spikes.
    :param kp_x: With kd_x, gives the x-trace decay ``kd_x/(kp_x+kd_x)`` and the input scale ``1/(kp_x+kd_x)``.
    :param kd_x: See kp_x.
    :param kp_e: With kd_e, gives the e-trace decay ``kd_e/(kp_e+kd_e)`` and the input scale ``1/(kp_e+kd_e)``.
    :param kd_e: See kp_e.
    :param shapes: (n_samples, n_in, n_out)
    :param epsilon: Not used by this implementation.
    :return: The emitted values summed over the first (sample) axis.
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes

    # Trace product as recorded at the latest spike of each pair.
    product_at_spike = create_shared_variable(np.zeros((n_samples, n_in, n_out)))

    # Per-step decay factors.
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    # True wherever the input unit or the output unit of a pair is nonzero.
    active = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])

    xr_decayed = xr * rx
    er_decayed = er * re
    product_now = xr_decayed[:, :, None] * er_decayed[:, None, :]
    dws = (active * (product_now - product_at_spike)) / (rx * re - 1)

    # Add the current inputs to the traces; the recorded product is only
    # overwritten for pairs that were active on this step.
    new_xr = xr_decayed + xs / (kp_x + kd_x)
    new_er = er_decayed + es / (kp_e + kd_e)
    product_after_input = new_xr[:, :, None] * new_er[:, None, :]
    add_update(product_at_spike,
               tt.switch(active, product_after_input, product_at_spike))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                 alpha_star, phi_star, derphi_star):
    """
    One scan step of the step-size search: test `alpha1`, otherwise double it.

    Three tests are evaluated at `alpha1` (`phi0`, `derphi0`, `c1`, `c2`,
    `phi`, `derphi`, `profile` and `mode` come from the enclosing scope):

    * cond1: `phi_a1 > phi0 + c1*alpha1*derphi0`, or
      `phi_a1 >= phi_a0` while `i_t > 0`;
    * cond2: `|derphi(alpha1)| <= -c2*derphi0`;
    * cond3: `derphi(alpha1) >= 0`.

    The candidate result is chosen in that priority order: `_zoom` between
    `alpha0` and `alpha1` for cond1, the point `alpha1` itself for cond2,
    `_zoom` between `alpha1` and `alpha0` for cond3, and otherwise the doubled
    step with a NaN derivative.  The incoming `alpha_star`, `phi_star` and
    `derphi_star` are not read; they are replaced by the new candidate.

    Returns the next scan state and an `until` condition that fires when the
    doubled step equals zero or any of the three tests holds.
    """
    derphi_a1 = derphi(alpha1)

    cond1 = TT.bitwise_or(
        phi_a1 > phi0 + c1 * alpha1 * derphi0,
        TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
    cond2 = abs(derphi_a1) <= -c2 * derphi0
    cond3 = derphi_a1 >= zero

    # Both zoom graphs are always built; the ifelse chain below picks one.
    a_fwd, phi_fwd, dphi_fwd = _zoom(
        alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
        phi, derphi, phi0, derphi0, c1, c2,
        profile=profile, mode=mode)
    a_bwd, phi_bwd, dphi_bwd = _zoom(
        alpha1, alpha0, phi_a1, phi_a0, derphi_a1,
        phi, derphi, phi0, derphi0, c1, c2,
        profile=profile, mode=mode)

    # Step tried next if none of the tests holds.
    nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
    nw_phi = phi(nw_alpha1)

    # Build the selection from the innermost alternative outwards.
    pick_c3 = ifelse(cond3,
                     (a_bwd, phi_bwd, dphi_bwd),
                     (nw_alpha1, nw_phi, nan),
                     name='alphastar_c3')
    pick_c2 = ifelse(cond2,
                     (alpha1, phi_a1, derphi_a1),
                     pick_c3,
                     name='alphastar_c2')
    alpha_star, phi_star, derphi_star = ifelse(cond1,
                                               (a_fwd, phi_fwd, dphi_fwd),
                                               pick_c2,
                                               name='alphastar_c1')

    next_phi = ifelse(lazy_or('allconds', cond1, cond2, cond3),
                      phi_a1, nw_phi, name='nwphi1')
    next_derphi = ifelse(cond1, derphi_a0, derphi_a1, name='derphi')
    next_state = [alpha1, nw_alpha1, phi_a1, next_phi, next_derphi,
                  i_t + one, alpha_star, phi_star, derphi_star]

    finished = lazy_or('until_cond_',
                       TT.eq(nw_alpha1, zero), cond1, cond2, cond3)
    return (next_state, theano.scan_module.scan_utils.until(finished))
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Compute the per-step weight update implied by two spike trains.

    The output is intended to equal the result of an efficient, event-driven
    update, even though this implementation evaluates it densely for every
    (sample, input, output) triple.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes

    # Per-step decay factors of the two traces.
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    # State: step counters since the last spike, and the two traces.
    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    either_spiked = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])

    # The counters are 0 on a spike step and decrease by one per step after
    # it (see the updates below), so the maximum picks the later spike.
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])

    # Wasteful, since most of this is multiplied by zeros later, but for now
    # it doesn't matter.
    sum_to_last = geoseries_sum(rx * re, t_start=t_last, t_end=0)

    # PROBLEM HERE!!!! This can be a very small number times a very large
    # number.
    trace_product = xr[:, :, None] * er[:, None, :] * either_spiked
    dw_es = trace_product * sum_to_last
    # Alternative formulation, left disabled:
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last

    add_update(xr, xr * rx + xs / (kp_x + kd_x))
    add_update(er, er * re + es / (kp_e + kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last - 1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last - 1))
    return dw_es.sum(axis=0)
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False,
          mode=theano.Mode(linker='cvm')):
    """
    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Builds the symbolic graph that looks for a step size between `a_lo`
    and `a_hi`.  One trial step is evaluated up front (quadratic
    interpolation, replaced by the midpoint when the interpolant is NaN or
    too close to / outside the end points); a scan over `while_zoom`
    supplies the result when that first trial is not accepted.

    a_lo : scalar (step size)
    a_hi : scalar (step size)
    phi_lo : scalar (value of f at a_lo)
    phi_hi : scalar ( value of f at a_hi)
    derphi_lo : scalar ( value of derivative at a_lo)
    phi : callable -> generates computational graph
    derphi: callable -> generates computational graph
    phi0 : scalar ( value of f at 0)
    derphi0 : scalar (value of the derivative at 0)
    c1 : scalar (wolfe parameter)
    c2 : scalar (wolfe parameter)
    n_iters : number of steps handed to scan as `n_steps`
    profile: if you want printouts of profiling information
    mode : compilation mode handed to scan

    Returns the triple `(a_star, val_star, valprime)`: the selected step,
    the value of `phi` there, and the derivative there (NaN for the
    branches in which the code does not keep the derivative).
    """

    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and
        # a_hi Need to choose interpolation here.  Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation, if the result is still too
        # close, then use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        # Bisection fallback: the midpoint of the bracket.
        a_j_quad = TT.switch(
            cond_q,
            a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
            a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # A plain (non-lazy) switch is used here, so the quadratic branch is
        # always part of the graph; the lazy alternative would have been
        # ifelse(cond_c, a_j_quad, a_j_cubic).
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        # Stop once a_j gives sufficient decrease, improves on phi_lo and
        # has a small enough derivative.
        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        # NOTE: the order of the rebinding below matters -- every right-hand
        # side must still see the incoming a_lo/a_hi/phi_lo/phi_hi.
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1,
                      a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1,
                        phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1,
                          nan,
                          TT.switch(cond2, derphi_aj, nan),
                          name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration
    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)

    # The first trial uses quadratic interpolation with phi_lo, derphi_lo
    # and phi_hi; if the result is too close to the end points (or out of
    # the interval, or NaN) then use bisection
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))
    a_j = TT.switch(
        cond_q,
        a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
        a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    # NOTE: as in while_zoom, the rebinding order below is significant.
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1,
                   a_hi,
                   TT.switch(cond2, a_hi, a_lo),
                   name='marec')
    a_hi = ifelse(cond1,
                  a_j,
                  TT.switch(cond2, a_lo, a_hi),
                  name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    # True when the initial trial is already acceptable; must be built
    # before phi_lo is rebound below.
    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')

    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1,
                        nan,
                        TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')

    # Initial scan states: each value gets a non-broadcastable leading axis.
    # The last three (a_star, val_star, valprime) start at zero.
    states = [TT.unbroadcast(TT.shape_padleft(state), 0)
              for state in (phi_rec, a_rec, a_lo, a_hi, phi_hi,
                            phi_lo, derphi_lo, zero, zero, zero)]

    # print(...) with a single string prints the same text on Python 2 and 3.
    print('while_zoom')
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=mode,
                         profile=profile)
    print('done_while')

    # Use the initial trial when it was acceptable, otherwise what the loop
    # produced (element 0 of the corresponding scan output).
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
               phi_lo, derphi_lo, a_star, val_star, valprime):
    """
    One scan step of the zoom phase: pick a trial step and shrink the bracket.

    A trial step `a_j` between `a_lo` and `a_hi` is taken from the cubic
    interpolant; if that is NaN or lies within `delta1 * dalpha` of an end
    point (or outside the interval) the quadratic interpolant is used, and
    if that in turn is NaN or within `delta2 * dalpha` of an end point the
    midpoint is used.  `phi`, `derphi`, `phi0`, `derphi0`, `c1`, `c2`,
    `delta1` and `delta2` are read from the enclosing scope.

    The incoming `a_star`, `val_star` and `valprime` are not read; they are
    replaced by `a_j`, `phi(a_j)` and a derivative that is NaN except in
    the branch where `cond1` is false and `cond2` is true.

    Returns the ten updated state values and an `until` condition.
    """
    half = numpy.asarray(0.5, dtype=theano.config.floatX)

    # Ordered end points: a is the smaller, b the larger of (a_lo, a_hi).
    dalpha = a_hi - a_lo
    reversed_bracket = dalpha < zero
    a = TT.switch(reversed_bracket, a_hi, a_lo)
    b = TT.switch(reversed_bracket, a_lo, a_hi)

    # Cubic candidate (uses phi_lo, derphi_lo, phi_hi and the most recent
    # value of phi) and its safety margin.
    cchk = delta1 * dalpha
    a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                          a_hi, phi_hi, a_rec, phi_rec)

    # Quadratic candidate and its margin, with bisection as the fallback.
    qchk = delta2 * dalpha
    a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('condq',
                     TT.isnan(a_j_quad),
                     a_j_quad > b - qchk,
                     a_j_quad < a + qchk)
    a_j_quad = TT.switch(cond_q, a_lo + half * dalpha, a_j_quad)

    # Choose between the two candidates.  This is a plain switch, so the
    # quadratic branch is always in the graph; the lazy form would be
    # ifelse(cond_c, a_j_quad, a_j_cubic).
    cond_c = lazy_or('condc',
                     TT.isnan(a_j_cubic),
                     TT.bitwise_or(a_j_cubic > b - cchk,
                                   a_j_cubic < a + cchk))
    a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)

    # Evaluate the trial step.
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    stop = lazy_and('stop',
                    TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                   phi_aj < phi_lo),
                    abs(derphi_aj) <= -c2 * derphi0)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed.  Every next_* value is computed
    # from the incoming (un-updated) bracket.
    next_phi_rec = ifelse(cond1, phi_hi,
                          TT.switch(cond2, phi_hi, phi_lo),
                          name='phi_rec')
    next_a_rec = ifelse(cond1, a_hi,
                        TT.switch(cond2, a_hi, a_lo),
                        name='a_rec')
    next_a_hi = ifelse(cond1, a_j,
                       TT.switch(cond2, a_lo, a_hi),
                       name='a_hi')
    next_phi_hi = ifelse(cond1, phi_aj,
                         TT.switch(cond2, phi_lo, phi_hi),
                         name='phi_hi')
    next_a_lo = TT.switch(cond1, a_lo, a_j)
    next_phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    next_derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')
    next_valprime = ifelse(cond1, nan,
                           TT.switch(cond2, derphi_aj, nan),
                           name='valprime')

    next_state = [next_phi_rec,
                  next_a_rec,
                  next_a_lo,
                  next_a_hi,
                  next_phi_hi,
                  next_phi_lo,
                  next_derphi_lo,
                  a_j,
                  phi_aj,
                  next_valprime]
    return (next_state, theano.scan_module.scan_utils.until(stop))
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False):
    """
    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Builds the symbolic graph that looks for a step size between `a_lo`
    and `a_hi`.  A first trial step is evaluated directly (quadratic
    interpolation, replaced by the midpoint when the interpolant is NaN or
    too close to / outside the end points); a scan over `while_zoom`
    supplies the result when that first trial is not accepted.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    n_iters : int
        Number of steps handed to scan as `n_steps`
    profile : bool
        True if you want printouts of profiling information

    Returns
    -------
    a_star, val_star, valprime
        The selected step, the value of `phi` there, and the derivative
        there (NaN for the branches in which the code does not keep it).
    """
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX))

    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi:
        # cubic first; if that is within delta1 * dalpha of an end point
        # (or outside the interval, or NaN) use the quadratic interpolant;
        # if that is still too close, use bisection.
        half = numpy.asarray(0.5, dtype=theano.config.floatX)

        dalpha = a_hi - a_lo
        reversed_bracket = dalpha < zero
        a = TT.switch(reversed_bracket, a_hi, a_lo)
        b = TT.switch(reversed_bracket, a_lo, a_hi)

        # cubic interpolation (uses phi_lo, derphi_lo, phi_hi, and the most
        # recent value of phi)
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)

        # quadratic interpolation, with the midpoint as fallback
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo + half * dalpha, a_j_quad)

        # pick between the two; a plain switch, so the quadratic branch is
        # always in the graph (lazy form: ifelse(cond_c, a_j_quad, a_j_cubic))
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed.  Every next_* value is
        # computed from the incoming (un-updated) bracket.
        next_phi_rec = ifelse(cond1, phi_hi,
                              TT.switch(cond2, phi_hi, phi_lo),
                              name='phi_rec')
        next_a_rec = ifelse(cond1, a_hi,
                            TT.switch(cond2, a_hi, a_lo),
                            name='a_rec')
        next_a_hi = ifelse(cond1, a_j,
                           TT.switch(cond2, a_lo, a_hi),
                           name='a_hi')
        next_phi_hi = ifelse(cond1, phi_aj,
                             TT.switch(cond2, phi_lo, phi_hi),
                             name='phi_hi')
        next_a_lo = TT.switch(cond1, a_lo, a_j)
        next_phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        next_derphi_lo = ifelse(cond1, derphi_lo, derphi_aj,
                                name='derphi_lo')
        next_valprime = ifelse(cond1, nan,
                               TT.switch(cond2, derphi_aj, nan),
                               name='valprime')

        next_state = [next_phi_rec,
                      next_a_rec,
                      next_a_lo,
                      next_a_hi,
                      next_phi_hi,
                      next_phi_lo,
                      next_derphi_lo,
                      a_j,
                      phi_aj,
                      next_valprime]
        return (next_state, theano.scan_module.scan_utils.until(stop))

    # Initial iteration: quadratic interpolation with phi_lo, derphi_lo and
    # phi_hi; if the result is too close to the end points (or out of the
    # interval, or NaN) then use bisection.
    half = numpy.asarray(0.5, dtype=theano.config.floatX)
    dalpha = a_hi - a_lo
    reversed_bracket = dalpha < zero
    a = TT.switch(reversed_bracket, a_hi, a_lo)
    b = TT.switch(reversed_bracket, a_lo, a_hi)

    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))
    a_j = TT.switch(cond_q, a_lo + half * dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed.  All first_* values are built
    # from the bracket that was passed in.
    first_phi_rec = ifelse(cond1, phi_hi,
                           TT.switch(cond2, phi_hi, phi_lo),
                           name='mphirec')
    first_a_rec = ifelse(cond1, a_hi,
                         TT.switch(cond2, a_hi, a_lo),
                         name='marec')
    first_a_hi = ifelse(cond1, a_j,
                        TT.switch(cond2, a_lo, a_hi),
                        name='mahi')
    first_phi_hi = ifelse(cond1, phi_aj,
                          TT.switch(cond2, phi_lo, phi_hi),
                          name='mphihi')

    # True when the initial trial is already acceptable.
    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    first_a_lo = TT.switch(cond1, a_lo, a_j)
    first_phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    first_derphi_lo = ifelse(cond1, derphi_lo, derphi_aj,
                             name='derphi_lo_main')

    # Final variable names (these override the names given to ifelse above).
    first_phi_rec.name = 'phi_rec'
    first_a_rec.name = 'a_rec'
    first_a_lo.name = 'a_lo'
    first_a_hi.name = 'a_hi'
    first_phi_hi.name = 'phi_hi'
    first_phi_lo.name = 'phi_lo'
    first_derphi_lo.name = 'derphi_lo'

    vderphi_aj = ifelse(cond1, nan,
                        TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')

    # Initial scan states: each value gets a non-broadcastable leading axis.
    # The last three (a_star, val_star, valprime) start at zero.
    states = [TT.unbroadcast(TT.shape_padleft(state), 0)
              for state in (first_phi_rec, first_a_rec, first_a_lo,
                            first_a_hi, first_phi_hi, first_phi_lo,
                            first_derphi_lo, zero, zero, zero)]

    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=n_iters,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)

    # Use the initial trial when it was acceptable, otherwise what the loop
    # produced (element 0 of the corresponding scan output).
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
def init_gpu(self, options, channel, data, model):
    """Build and compile the Theano functions used by this optimizer.

    The body uses Python 2 ``print`` statements.

    Functions compiled here and stored on ``self``:

    * ``compute_eucledian_gradients(gbdx)`` -- no outputs; its updates
      write the averaged gradient of ``model.train_cost`` into
      ``self.gs`` and the accumulated squared ``TT.Lop`` products into
      ``self.js``.
    * ``compute_riemannian_gradients(reset, rbdx)`` -- runs
      ``minres.minres`` on ``compute_Gv``; its updates write
      ``self.rs``, ``self.ds``, ``self.norm_km1km1``, ``self.norm_dkm1``
      and ``self.norm_d``. Returns the solver diagnostics followed by
      ``norm_grads``, ``norm_ord0`` and ``beta_k``.
    * ``update_params(lr)`` -- sets each parameter to ``p + lr * d``.
    * ``reset_directions()`` -- sets ``self.ds`` to ``-self.rs`` and
      ``self.norm_d`` to the norm of ``self.rs``.
    * ``ls_cost_fn(lr, ebdx)`` / ``ls_grad_fn(lr, ebdx)`` -- averaged
      cost at ``p + lr * d`` and its derivative with respect to ``lr``.
    * ``approx_change(lr)`` -- ``-lr * sum(g . d)`` over ``self.gs`` and
      ``self.ds``.
    * ``compute_error(ebdx)`` -- ``model.err`` averaged over chunks.

    Other attributes set: ``damping``, ``nwds``, ``old_score``,
    ``old_cost``, ``options``, ``perm``, ``pos``.

    :param options: mapping; keys read here are ``'cbs'``, ``'gbs'``,
        ``'mbs'``, ``'ebs'``, ``'jreg'``, ``'mreg'``, ``'mrtol'``,
        ``'miters'`` and ``'profile'``.
    :param channel: not referenced in this method body.
    :param data: not referenced in this method body.
    :param model: object providing ``inputs``, ``params``,
        ``params_shape``, ``train_cost``, ``outs``, ``outs_operator``
        and ``err``.
    :return: None
    """
    # Step 1. Compile function for computing eucledian gradients
    eps = numpy.float32(1e-24)
    gbdx = TT.iscalar('grad_batch_idx')
    # NOTE(review): this reads self.model.params while the rest of the
    # method uses the `model` argument -- confirm they are the same object.
    n_params = len(self.model.params)
    print 'Constructing grad function'
    loc_inputs = [x.type() for x in model.inputs]
    srng = RandomStreams(numpy.random.randint(1e5))
    # NOTE(review): loc_inputs is rebuilt here; the list created above is
    # discarded without being used.
    loc_inputs = [x.type() for x in model.inputs]
    def grad_step(*args):
        # args layout: [step index] + n_params gradient accumulators
        #              + n_params squared-Lop accumulators.
        idx = TT.cast(args[0], 'int32')
        # Slice chunk number `idx` (options['cbs'] rows) out of each input.
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        gs = TT.grad(nw_cost, model.params)
        nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
        # Compute jacobi
        nw_outs = safe_clone(model.outs, replace=replace)
        # Per-parameter sum of squared Lop results over all outputs.
        final_results = dict(zip(model.params, [None]*n_params))
        for nw_out, out_operator in zip(nw_outs, model.outs_operator):
            if out_operator == 'sigmoid':
                denom = numpy.float32(options['cbs'])
                # (disabled) denom *= nw_out
                # (disabled) denom *= (numpy.float32(1) - nw_out)
            elif out_operator == 'softmax':
                denom = numpy.float32(options['cbs'])
                denom *= (nw_out+eps)
            else:
                denom = numpy.float32(options['cbs'])
            factor = TT.sqrt(numpy.float32(1) / denom)
            if out_operator == 'sigmoid':
                # For sigmoid outputs the factor is additionally scaled by
                # sqrt(s * (1 - s)) with s = sigmoid(nw_out).
                tnwout = TT.nnet.sigmoid(nw_out)
                factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout))*factor
            # Sign of normal samples, scaled by `factor`.
            r = TT.sgn(srng.normal(nw_out.shape))
            r = r * factor
            # Only the parameters that appear among the graph inputs of
            # this output.
            loc_params = [x for x in model.params if x in
                          theano.gof.graph.inputs([nw_out])]
            jvs = TT.Lop(nw_out, loc_params, r)
            for lp, lj in zip(loc_params, jvs):
                if final_results[lp] is None:
                    final_results[lp] = TT.sqr(lj)
                else:
                    final_results[lp] = final_results[lp] + TT.sqr(lj)
        # NOTE(review): a parameter that no output depends on would leave
        # None in final_results and fail in the addition below.
        nw_js = [oj + final_results[p] for oj, p in
                 zip(args[1+n_params:1+2*n_params], model.params)]
        return [args[0] + const(1)] + nw_gs + nw_js
    # Initial states: zeros for the gradients, options['jreg'] for the
    # squared-Lop accumulators.
    ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
          for shp in model.params_shape]
    ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
          for shp in model.params_shape]
    idx0 = TT.unbroadcast(const([0]),0)
    n_steps = options['gbs'] // options['cbs']
    rvals, updates = scan(grad_step,
                          states=[idx0] + ig + ij,
                          n_steps=n_steps,
                          name='grad_loop',
                          mode=gpu_mode,
                          profile=options['profile'])
    # Gradients are averaged over the chunks; the js accumulators are not.
    nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
    nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]]
    updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
    # Feed the symbolic inputs from batch `gbdx` of the shared data.
    grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                 for x,y in zip(loc_inputs, self.shared_data)]
    print 'Compiling grad function'
    self.compute_eucledian_gradients = theano.function(
        [gbdx],
        [],
        updates=updates,
        givens=dict(grad_inps),
        allow_input_downcast=True,
        name='compute_eucledian_gradients',
        mode=gpu_mode,
        profile=options['profile'])
    # Step 2. Compile function for Computing Riemannian gradients
    rbdx = TT.iscalar('riemmanian_batch_idx')
    def compute_Gv(*args):
        # `args` holds one tensor per model parameter; the function is
        # handed to minres.minres below.
        idx0 = const([0])
        ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]
        def Gv_step(*gv_args):
            # gv_args layout: [step index] + one accumulator per parameter.
            idx = TT.cast(gv_args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_outs = safe_clone(model.outs, replace)
            final_results = dict(zip(model.params, [None] * len(model.params)))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                # Parameters (and matching entries of `args`) that this
                # output depends on.
                loc_params = [x for x in model.params
                              if x in theano.gof.graph.inputs([nw_out])]
                loc_args = [x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])]
                if out_operator == 'softmax':
                    factor = const(options['cbs']) * (nw_out + eps)
                elif out_operator == 'sigmoid':
                    factor = const(options['cbs'])# * nw_out * (1 - nw_out)
                else:
                    factor = const(options['cbs'])
                if out_operator != 'sigmoid':
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                else:
                    # Sigmoid outputs: the Rop result is weighted by
                    # s * (1 - s) with s = sigmoid(nw_out).
                    tnwout = TT.nnet.sigmoid(nw_out)
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) *\
                                     tnwout * (1 - tnwout)/ factor)
                for lp, lgv in zip(loc_params, loc_Gvs):
                    if final_results[lp] is None:
                        final_results[lp] = lgv
                    else:
                        final_results[lp] += lgv
            Gvs = [ogv + final_results[param]
                   for (ogv, param) in zip(gv_args[1:], model.params)]
            return [gv_args[0] + const(1)] + Gvs
        states = [idx0] + ep
        n_steps = options['mbs'] // options['cbs']
        rvals, updates = scan(Gv_step,
                              states=states,
                              n_steps=n_steps,
                              mode=theano.Mode(linker='cvm'),
                              name='Gv_step',
                              profile=options['profile'])
        # Average the accumulated products over the chunks.
        final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
        return final_Gvs, updates
    print 'Constructing riemannian gradient function'
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    self.damping = theano.shared(numpy.float32(options['mreg']))
    # The gradient is divided by its norm before the solve and the result
    # is multiplied by the same norm afterwards (nw_rs below).
    # self.js is passed as `Ms` -- presumably a preconditioner; confirm
    # against minres.minres.
    rvals = minres.minres(
        compute_Gv,
        [x / norm_grads for x in self.gs],
        Ms = self.js,
        rtol=options['mrtol'],
        shift=self.damping,
        maxit=options['miters'],
        profile=options['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = rvals[1]
    niters = rvals[2]
    rel_residual = rvals[3]
    rel_Aresidual = rvals[4]
    Anorm = rvals[5]
    Acond = rvals[6]
    xnorm = rvals[7]
    Axnorm = rvals[8]
    updates = rvals[9]
    # Largest absolute entry over all tensors in nw_rs.
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    reset = TT.scalar(dtype='int8', name='reset')
    # Inner products with the current gradient of the previous result,
    # the new result and the previous directions.
    norm_kkm1 = sum([(r*g).sum() for r,g in zip(self.rs, self.gs)])
    norm_kk = sum([(r*g).sum() for r,g in zip(nw_rs, self.gs)])
    norm_dk = sum([(d*g).sum() for d,g in zip(self.ds, self.gs)])
    norm_y = norm_kk - 2*norm_kkm1 + self.norm_km1km1
    beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
        2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
    # beta_k is forced to zero when `reset` is non-zero or when the
    # formula above produced nan/inf.
    beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
    beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                       TT.constant(numpy.float32(0.)),
                       beta_k)
    # New directions: -r + beta_k * d.
    nwds = [-r + beta_k*d for r,d in zip(nw_rs, self.ds)]
    self.nwds = nwds
    nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
        numpy.float32(1e-25)
    updates.update(dict(zip(self.rs, nw_rs)))
    updates.update(dict(zip(self.ds, nwds)))
    updates[self.norm_km1km1] = norm_kk
    updates[self.norm_dkm1] = norm_dk
    updates[self.norm_d] = nw_normd
    print 'Compiling riemannian gradient function'
    cst = time.time()
    grad_inps = [(x, y[rbdx*options['mbs']:(rbdx+1)*options['mbs']])
                 for x,y in zip(loc_inputs, self.shared_data)]
    self.compute_riemannian_gradients = theano.function(
        [reset, rbdx],
        [flag,
         niters,
         rel_residual,
         rel_Aresidual,
         Anorm,
         Acond,
         xnorm,
         Axnorm,
         norm_grads,
         norm_ord0,
         beta_k],
        updates=updates,
        allow_input_downcast = True,
        givens=dict(grad_inps),
        name='compute_riemannian_gradients',
        mode=cpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
    print 'Time to compile Riemannian', print_time(time.time() - cst)
    cst = time.time()
    # Step 3. Compile function for evaluating cost and updating
    # parameters
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    # Candidate parameters for a step of size lr along self.ds.
    newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
    nw_ds = [ -r for r in self.rs]
    # Rebinds nw_normd: from here on it is the norm of self.rs, used by
    # reset_directions only.
    nw_normd = TT.sqrt(sum([(r*r).sum() for r in self.rs]))
    self.update_params = theano.function([lr],
                                         updates = dict(zip(model.params,
                                                            newparams)),
                                         name='update_params',
                                         on_unused_input='warn',
                                         allow_input_downcast=True,
                                         mode=gpu_mode,
                                         profile=options['profile'])
    self.reset_directions = theano.function([],
                                            updates=dict(zip(self.ds +
                                                             [self.norm_d],
                                                             nw_ds +
                                                             [nw_normd])),
                                            name='reset_dirs',
                                            on_unused_input='warn',
                                            mode=cpu_mode,
                                            allow_input_downcast=True,
                                            profile=options['profile'])
    n_steps = options['ebs'] // options['cbs']
    def ls_cost_step(_idx, acc):
        # Adds the cost of chunk `_idx`, evaluated at `newparams`, to acc.
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params, nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        return [_idx + const(1), acc + nw_cost]
    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_cost_step,
                    states = states,
                    n_steps = n_steps,
                    name='ls_cost_step',
                    profile = options['profile'])
    fcost = rvals[1][0] / const(n_steps)
    def ls_grad_step(_idx, gws):
        # Same as ls_cost_step, but accumulates d(cost)/d(lr).
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                   for x in loc_inputs]
        replace = dict(zip(model.inputs + model.params, nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        nw_gs = TT.grad(nw_cost, lr)
        return _idx + numpy.float32(1), gws + nw_gs
    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_grad_step,
                    states = states,
                    n_steps = n_steps,
                    name = 'ls_grad_step',
                    profile=options['profile'])
    fgrad = rvals[1][0] / const(n_steps)
    ebdx = TT.iscalar('ebdx')
    grad_inps = [(x, y[ebdx * options['ebs']: (ebdx + 1) * options['ebs']])
                 for x,y in zip(loc_inputs, self.shared_data)]
    # NOTE(review): ls_cost_fn and ls_grad_fn pass `givens` as a list of
    # pairs, whereas the other functions wrap it in dict().
    self.ls_cost_fn = theano.function(
        [lr, ebdx],
        fcost,
        givens = grad_inps,
        allow_input_downcast=True,
        name='ls_cost_fn',
        mode=gpu_mode,
        profile=options['profile'])
    self.approx_change = theano.function(
        [lr],
        -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.ds)]),
        allow_input_downcast=True,
        name='approx_change',
        mode=gpu_mode,
        profile=options['profile'])
    self.ls_grad_fn = theano.function(
        [lr, ebdx],
        fgrad,
        allow_input_downcast=True,
        givens = grad_inps,
        name='ls_grad_fn',
        mode=gpu_mode,
        profile=options['profile'])
    self.old_score = 50000
    n_steps = options['ebs']// options['cbs']
    def ls_error(_idx, acc):
        # Adds model.err of chunk `_idx` (cast to float32) to acc; the
        # model parameters are not replaced here.
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = TT.cast(safe_clone(
            model.err, replace=replace), 'float32')
        return [_idx + const(1), acc + nw_cost]
    states = [TT.constant(numpy.float32([0])),
              TT.constant(numpy.float32([0]))]
    rvals, _ = scan(ls_error,
                    states = states,
                    n_steps = n_steps,
                    name='ls_err_step',
                    mode=gpu_mode,
                    profile = options['profile'])
    ferr = rvals[1][0] / const(n_steps)
    self.compute_error = theano.function([ebdx],
                                         ferr,
                                         givens=dict(grad_inps),
                                         name='compute_err',
                                         mode=cpu_mode,
                                         allow_input_downcast=True,
                                         on_unused_input='warn',
                                         profile=options['profile'])
    print 'Compile eval time', print_time(time.time() - cst)
    # Bookkeeping defaults.
    self.old_cost = 1e6
    self.options = options
    self.perm = self.rng.permutation(4)
    self.pos = 0
def init_gpu(self, options, channel, data, model):
    """Build and compile the Theano functions used by this optimizer.

    The body uses Python 2 ``print`` statements.

    Functions compiled here and stored on ``self``:

    * ``compute_eucledian_gradients(gbdx)`` -- no outputs; its updates
      write the averaged gradient of ``model.train_cost`` into
      ``self.gs`` and the accumulated squared ``TT.Lop`` products into
      ``self.js``.
    * ``compute_riemannian_gradients(reset, rbdx)`` -- runs
      ``minres.minres`` on ``compute_Gv``; its updates write
      ``self.rs``, ``self.ds``, ``self.norm_km1km1``, ``self.norm_dkm1``
      and ``self.norm_d``. Returns the solver diagnostics followed by
      ``norm_grads``, ``norm_ord0`` and ``beta_k``.
    * ``update_params(lr)`` -- sets each parameter to ``p + lr * d``.
    * ``reset_directions()`` -- sets ``self.ds`` to ``-self.rs`` and
      ``self.norm_d`` to the norm of ``self.rs``.
    * ``ls_cost_fn(lr, ebdx)`` / ``ls_grad_fn(lr, ebdx)`` -- averaged
      cost at ``p + lr * d`` and its derivative with respect to ``lr``.
    * ``approx_change(lr)`` -- ``-lr * sum(g . d)`` over ``self.gs`` and
      ``self.ds``.
    * ``compute_error(ebdx)`` -- ``model.err`` averaged over chunks.

    Other attributes set: ``damping``, ``nwds``, ``old_score``,
    ``old_cost``, ``options``, ``perm``, ``pos``.

    :param options: mapping; keys read here are ``'cbs'``, ``'gbs'``,
        ``'mbs'``, ``'ebs'``, ``'jreg'``, ``'mreg'``, ``'mrtol'``,
        ``'miters'`` and ``'profile'``.
    :param channel: not referenced in this method body.
    :param data: not referenced in this method body.
    :param model: object providing ``inputs``, ``params``,
        ``params_shape``, ``train_cost``, ``outs``, ``outs_operator``
        and ``err``.
    :return: None
    """
    # Step 1. Compile function for computing eucledian gradients
    eps = numpy.float32(1e-24)
    gbdx = TT.iscalar('grad_batch_idx')
    # NOTE(review): this reads self.model.params while the rest of the
    # method uses the `model` argument -- confirm they are the same object.
    n_params = len(self.model.params)
    print 'Constructing grad function'
    loc_inputs = [x.type() for x in model.inputs]
    srng = RandomStreams(numpy.random.randint(1e5))
    # NOTE(review): loc_inputs is rebuilt here; the list created above is
    # discarded without being used.
    loc_inputs = [x.type() for x in model.inputs]

    def grad_step(*args):
        # args layout: [step index] + n_params gradient accumulators
        #              + n_params squared-Lop accumulators.
        idx = TT.cast(args[0], 'int32')
        # Slice chunk number `idx` (options['cbs'] rows) out of each input.
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        gs = TT.grad(nw_cost, model.params)
        nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
        # Compute jacobi
        nw_outs = safe_clone(model.outs, replace=replace)
        # Per-parameter sum of squared Lop results over all outputs.
        final_results = dict(zip(model.params, [None] * n_params))
        for nw_out, out_operator in zip(nw_outs, model.outs_operator):
            if out_operator == 'sigmoid':
                denom = numpy.float32(options['cbs'])
                # (disabled) denom *= nw_out
                # (disabled) denom *= (numpy.float32(1) - nw_out)
            elif out_operator == 'softmax':
                denom = numpy.float32(options['cbs'])
                denom *= (nw_out + eps)
            else:
                denom = numpy.float32(options['cbs'])
            factor = TT.sqrt(numpy.float32(1) / denom)
            if out_operator == 'sigmoid':
                # For sigmoid outputs the factor is additionally scaled by
                # sqrt(s * (1 - s)) with s = sigmoid(nw_out).
                tnwout = TT.nnet.sigmoid(nw_out)
                factor = TT.sqrt(tnwout *
                                 (numpy.float32(1) - tnwout)) * factor
            # Sign of normal samples, scaled by `factor`.
            r = TT.sgn(srng.normal(nw_out.shape))
            r = r * factor
            # Only the parameters that appear among the graph inputs of
            # this output.
            loc_params = [
                x for x in model.params
                if x in theano.gof.graph.inputs([nw_out])
            ]
            jvs = TT.Lop(nw_out, loc_params, r)
            for lp, lj in zip(loc_params, jvs):
                if final_results[lp] is None:
                    final_results[lp] = TT.sqr(lj)
                else:
                    final_results[lp] = final_results[lp] + TT.sqr(lj)
        # NOTE(review): a parameter that no output depends on would leave
        # None in final_results and fail in the addition below.
        nw_js = [
            oj + final_results[p] for oj, p in zip(
                args[1 + n_params:1 + 2 * n_params], model.params)
        ]
        return [args[0] + const(1)] + nw_gs + nw_js

    # Initial states: zeros for the gradients, options['jreg'] for the
    # squared-Lop accumulators.
    ig = [
        TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
        for shp in model.params_shape
    ]
    ij = [
        TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
        for shp in model.params_shape
    ]
    idx0 = TT.unbroadcast(const([0]), 0)
    n_steps = options['gbs'] // options['cbs']
    rvals, updates = scan(grad_step,
                          states=[idx0] + ig + ij,
                          n_steps=n_steps,
                          name='grad_loop',
                          mode=gpu_mode,
                          profile=options['profile'])
    # Gradients are averaged over the chunks; the js accumulators are not.
    nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
    nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
    updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
    # Feed the symbolic inputs from batch `gbdx` of the shared data.
    grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    print 'Compiling grad function'
    self.compute_eucledian_gradients = theano.function(
        [gbdx], [],
        updates=updates,
        givens=dict(grad_inps),
        allow_input_downcast=True,
        name='compute_eucledian_gradients',
        mode=gpu_mode,
        profile=options['profile'])
    # Step 2. Compile function for Computing Riemannian gradients
    rbdx = TT.iscalar('riemmanian_batch_idx')

    def compute_Gv(*args):
        # `args` holds one tensor per model parameter; the function is
        # handed to minres.minres below.
        idx0 = const([0])
        ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

        def Gv_step(*gv_args):
            # gv_args layout: [step index] + one accumulator per parameter.
            idx = TT.cast(gv_args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_outs = safe_clone(model.outs, replace)
            final_results = dict(
                zip(model.params, [None] * len(model.params)))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                # Parameters (and matching entries of `args`) that this
                # output depends on.
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                loc_args = [
                    x for x, y in zip(args, model.params)
                    if y in theano.gof.graph.inputs([nw_out])
                ]
                if out_operator == 'softmax':
                    factor = const(options['cbs']) * (nw_out + eps)
                elif out_operator == 'sigmoid':
                    factor = const(
                        options['cbs'])  # * nw_out * (1 - nw_out)
                else:
                    factor = const(options['cbs'])
                if out_operator != 'sigmoid':
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                else:
                    # Sigmoid outputs: the Rop result is weighted by
                    # s * (1 - s) with s = sigmoid(nw_out).
                    tnwout = TT.nnet.sigmoid(nw_out)
                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) *\
                                     tnwout * (1 - tnwout)/ factor)
                for lp, lgv in zip(loc_params, loc_Gvs):
                    if final_results[lp] is None:
                        final_results[lp] = lgv
                    else:
                        final_results[lp] += lgv
            Gvs = [
                ogv + final_results[param]
                for (ogv, param) in zip(gv_args[1:], model.params)
            ]
            return [gv_args[0] + const(1)] + Gvs

        states = [idx0] + ep
        n_steps = options['mbs'] // options['cbs']
        rvals, updates = scan(Gv_step,
                              states=states,
                              n_steps=n_steps,
                              mode=theano.Mode(linker='cvm'),
                              name='Gv_step',
                              profile=options['profile'])
        # Average the accumulated products over the chunks.
        final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
        return final_Gvs, updates

    print 'Constructing riemannian gradient function'
    norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
    self.damping = theano.shared(numpy.float32(options['mreg']))
    # The gradient is divided by its norm before the solve and the result
    # is multiplied by the same norm afterwards (nw_rs below).
    # self.js is passed as `Ms` -- presumably a preconditioner; confirm
    # against minres.minres.
    rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                          Ms=self.js,
                          rtol=options['mrtol'],
                          shift=self.damping,
                          maxit=options['miters'],
                          profile=options['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = rvals[1]
    niters = rvals[2]
    rel_residual = rvals[3]
    rel_Aresidual = rvals[4]
    Anorm = rvals[5]
    Acond = rvals[6]
    xnorm = rvals[7]
    Axnorm = rvals[8]
    updates = rvals[9]
    # Largest absolute entry over all tensors in nw_rs.
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    reset = TT.scalar(dtype='int8', name='reset')
    # Inner products with the current gradient of the previous result,
    # the new result and the previous directions.
    norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
    norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
    norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])
    norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
    beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
        2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
    # beta_k is forced to zero when `reset` is non-zero or when the
    # formula above produced nan/inf.
    beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
    beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                       TT.constant(numpy.float32(0.)), beta_k)
    # New directions: -r + beta_k * d.
    nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
    self.nwds = nwds
    nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
        numpy.float32(1e-25)
    updates.update(dict(zip(self.rs, nw_rs)))
    updates.update(dict(zip(self.ds, nwds)))
    updates[self.norm_km1km1] = norm_kk
    updates[self.norm_dkm1] = norm_dk
    updates[self.norm_d] = nw_normd
    print 'Compiling riemannian gradient function'
    cst = time.time()
    grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    self.compute_riemannian_gradients = theano.function(
        [reset, rbdx], [
            flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
            Axnorm, norm_grads, norm_ord0, beta_k
        ],
        updates=updates,
        allow_input_downcast=True,
        givens=dict(grad_inps),
        name='compute_riemannian_gradients',
        mode=cpu_mode,
        on_unused_input='warn',
        profile=options['profile'])
    print 'Time to compile Riemannian', print_time(time.time() - cst)
    cst = time.time()
    # Step 3. Compile function for evaluating cost and updating
    # parameters
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    # Candidate parameters for a step of size lr along self.ds.
    newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
    nw_ds = [-r for r in self.rs]
    # Rebinds nw_normd: from here on it is the norm of self.rs, used by
    # reset_directions only.
    nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
    self.update_params = theano.function([lr],
                                         updates=dict(
                                             zip(model.params, newparams)),
                                         name='update_params',
                                         on_unused_input='warn',
                                         allow_input_downcast=True,
                                         mode=gpu_mode,
                                         profile=options['profile'])
    self.reset_directions = theano.function(
        [],
        updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
        name='reset_dirs',
        on_unused_input='warn',
        mode=cpu_mode,
        allow_input_downcast=True,
        profile=options['profile'])
    n_steps = options['ebs'] // options['cbs']

    def ls_cost_step(_idx, acc):
        # Adds the cost of chunk `_idx`, evaluated at `newparams`, to acc.
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(
            zip(model.inputs + model.params, nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        return [_idx + const(1), acc + nw_cost]

    states = [
        TT.constant(numpy.float32([0])),
        TT.constant(numpy.float32([0]))
    ]
    rvals, _ = scan(ls_cost_step,
                    states=states,
                    n_steps=n_steps,
                    name='ls_cost_step',
                    profile=options['profile'])
    fcost = rvals[1][0] / const(n_steps)

    def ls_grad_step(_idx, gws):
        # Same as ls_cost_step, but accumulates d(cost)/d(lr).
        idx = TT.cast(_idx, 'int32')
        nw_inps = [
            x[idx * options['cbs']:(idx + 1) * options['cbs']]
            for x in loc_inputs
        ]
        replace = dict(
            zip(model.inputs + model.params, nw_inps + newparams))
        nw_cost = safe_clone(model.train_cost, replace=replace)
        nw_gs = TT.grad(nw_cost, lr)
        return _idx + numpy.float32(1), gws + nw_gs

    states = [
        TT.constant(numpy.float32([0])),
        TT.constant(numpy.float32([0]))
    ]
    rvals, _ = scan(ls_grad_step,
                    states=states,
                    n_steps=n_steps,
                    name='ls_grad_step',
                    profile=options['profile'])
    fgrad = rvals[1][0] / const(n_steps)
    ebdx = TT.iscalar('ebdx')
    grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                 for x, y in zip(loc_inputs, self.shared_data)]
    # NOTE(review): ls_cost_fn and ls_grad_fn pass `givens` as a list of
    # pairs, whereas the other functions wrap it in dict().
    self.ls_cost_fn = theano.function([lr, ebdx],
                                      fcost,
                                      givens=grad_inps,
                                      allow_input_downcast=True,
                                      name='ls_cost_fn',
                                      mode=gpu_mode,
                                      profile=options['profile'])
    self.approx_change = theano.function(
        [lr],
        -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
        allow_input_downcast=True,
        name='approx_change',
        mode=gpu_mode,
        profile=options['profile'])
    self.ls_grad_fn = theano.function([lr, ebdx],
                                      fgrad,
                                      allow_input_downcast=True,
                                      givens=grad_inps,
                                      name='ls_grad_fn',
                                      mode=gpu_mode,
                                      profile=options['profile'])
    self.old_score = 50000
    n_steps = options['ebs'] // options['cbs']

    def ls_error(_idx, acc):
        # Adds model.err of chunk `_idx` (cast to float32) to acc; the
        # model parameters are not replaced here.
        idx = TT.cast(_idx, 'int32')
        nw_inps = [x[idx * options['cbs']: \
                     (idx + 1) * options['cbs']] for x in loc_inputs]
        replace = dict(zip(model.inputs, nw_inps))
        nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                          'float32')
        return [_idx + const(1), acc + nw_cost]

    states = [
        TT.constant(numpy.float32([0])),
        TT.constant(numpy.float32([0]))
    ]
    rvals, _ = scan(ls_error,
                    states=states,
                    n_steps=n_steps,
                    name='ls_err_step',
                    mode=gpu_mode,
                    profile=options['profile'])
    ferr = rvals[1][0] / const(n_steps)
    self.compute_error = theano.function([ebdx],
                                         ferr,
                                         givens=dict(grad_inps),
                                         name='compute_err',
                                         mode=cpu_mode,
                                         allow_input_downcast=True,
                                         on_unused_input='warn',
                                         profile=options['profile'])
    print 'Compile eval time', print_time(time.time() - cst)
    # Bookkeeping defaults.
    self.old_cost = 1e6
    self.options = options
    self.perm = self.rng.permutation(4)
    self.pos = 0
def logic_or(x, y):
    """Return ``T.bitwise_or(x, y)``.

    :param x: first operand, forwarded unchanged.
    :param y: second operand, forwarded unchanged.
    :return: whatever ``T.bitwise_or`` returns for the two operands.
    """
    # `T` is resolved from the enclosing module (presumably theano.tensor
    # -- confirm against the file's imports).
    either = T.bitwise_or(x, y)
    return either
def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
               a_star, val_star, valprime):
    """Perform one step of the zoom loop; returns states plus a stop condition.

    Relies on names from the enclosing scope: ``zero``, ``nan``,
    ``delta1``, ``delta2``, ``phi``, ``derphi``, ``phi0``, ``derphi0``,
    ``c1``, ``c2``, ``_cubicmin``, ``_quadmin``, ``lazy_or``,
    ``lazy_and`` and ``ifelse``.

    :param phi_rec: value of phi at ``a_rec``.
    :param a_rec: most recently discarded end point.
    :param a_lo: end point of the bracket paired with ``phi_lo``.
    :param a_hi: other end point of the bracket.
    :param phi_hi: value of phi at ``a_hi``.
    :param phi_lo: value of phi at ``a_lo``.
    :param derphi_lo: derivative of phi at ``a_lo``.
    :param a_star: incoming value is not read; replaced by the trial
        step ``a_j``.
    :param val_star: incoming value is not read; replaced by
        ``phi(a_j)``.
    :param valprime: incoming value is not read; replaced by
        ``derphi(a_j)`` or ``nan``.
    :return: tuple ``(new_states, until(stop))`` where ``new_states``
        lists the ten updated values in the order of the parameters.
    """
    # Interpolate to find a trial step length between a_lo and a_hi.
    # Use cubic interpolation first; if the result is within
    # delta * dalpha of an end point, or outside the interval bounded by
    # a_lo and a_hi, use quadratic interpolation; if that result is still
    # too close, use bisection.
    dalpha = a_hi-a_lo
    # a <= b are the bracket end points in increasing order.
    a = TT.switch( dalpha < zero, a_hi, a_lo)
    b = TT.switch( dalpha < zero, a_lo, a_hi)
    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # cubic interpolation
    cchk = delta1*dalpha
    a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                          a_hi, phi_hi, a_rec, phi_rec)
    # quadratic interpolation
    qchk = delta2*dalpha
    a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('condq',TT.isnan(a_j_quad),
                     a_j_quad > b-qchk,
                     a_j_quad < a + qchk)
    # Bisection fallback: midpoint of the bracket.
    a_j_quad = TT.switch(cond_q, a_lo +
                         numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha,
                         a_j_quad)

    # pick between the two ..
    cond_c = lazy_or('condc',TT.isnan(a_j_cubic),
                     TT.bitwise_or(a_j_cubic > b - cchk,
                                   a_j_cubic < a + cchk))
    # The quadratic/bisection candidate replaces the cubic one when the
    # cubic result is rejected. A TT.switch is used here rather than the
    # ifelse kept in the comment below.
    a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
    #a_j = ifelse(cond_c, a_j_quad, a_j_cubic)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    # Stop when phi_aj satisfies the c1 inequality, is below phi_lo, and
    # |derphi_aj| <= -c2 * derphi0.
    stop = lazy_and('stop',
                    TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                                   phi_aj < phi_lo),
                    abs(derphi_aj) <= -c2*derphi0)

    # cond1: the trial point becomes the new a_hi.
    cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                          phi_aj >= phi_lo)
    # cond2 (only relevant when cond1 is false): the old a_lo becomes
    # a_hi before a_lo moves to the trial point.
    cond2 = derphi_aj*(a_hi - a_lo) >= zero
    # Switches just make more sense here because they have a C
    # implementation and they get composed
    # NOTE: the order below matters -- a_hi/phi_hi are rebuilt from the
    # old a_lo/phi_lo before those names are rebound.
    phi_rec = ifelse(
        cond1,
        phi_hi,
        TT.switch(
            cond2,
            phi_hi,
            phi_lo),
        name = 'phi_rec')
    a_rec = ifelse(
        cond1,
        a_hi,
        TT.switch(
            cond2,
            a_hi,
            a_lo),
        name='a_rec')
    a_hi = ifelse(
        cond1,
        a_j,
        TT.switch(
            cond2,
            a_lo,
            a_hi),
        name='a_hi')
    phi_hi = ifelse(
        cond1,
        phi_aj,
        TT.switch(
            cond2,
            phi_lo,
            phi_hi),
        name='phi_hi')

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

    # Candidate result reported to the caller for this step.
    a_star = a_j
    val_star = phi_aj
    # derphi_aj is reported only when cond1 is false and cond2 is true;
    # otherwise nan.
    valprime = ifelse(cond1, nan,
                      TT.switch(cond2, derphi_aj, nan), name='valprime')

    return ( [ phi_rec,
               a_rec,
               a_lo,
               a_hi,
               phi_hi,
               phi_lo,
               derphi_lo,
               a_star,
               val_star,
               valprime],
             theano.scan_module.scan_utils.until(stop) )
def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
         Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
         relrnorm, relArnorml, Anorm, flag, *args):
    """Perform one step of the iterative solver loop.

    The comments below refer to Lanczos vectors and plane rotations, so
    this appears to be one iteration of a MINRES-style solver -- confirm
    against the enclosing function.

    The 22 named parameters are scalar states carried between steps.
    ``args`` holds six consecutive groups of ``n_params`` tensors each:
    ``xs``, ``r1s``, ``r2s``, ``r3s``, ``dls`` and ``ds``.

    Relies on names from the enclosing scope: ``n_params``,
    ``compute_Av``, ``shift``, ``Ms``, ``constantX``, ``inner_product``,
    ``sqrt_inner_product``, ``symGivens2``, ``multiple_switch``,
    ``maxxnorm``, ``bnorm``, ``eps``, ``rtol``, ``beta1`` and ``maxit``.

    :return: 3-tuple of (list of updated states in the same order as the
        parameters, the updates returned by ``compute_Av``, an ``until``
        condition that is true once ``flag`` is non-zero).
    """
    #-----------------------------------------------------------------
    ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
    # The general iteration is similar to the case k = 1 with v0 = 0:
    #
    #   p1      = Operator * v1  -  beta1 * v0,
    #   alpha1  = v1'p1,
    #   q2      = p2  -  alpha1 * v1,
    #   beta2^2 = q2'q2,
    #   v2      = (1/beta2) q2.
    #
    # Again, p = betak P vk,  where  P = C**(-1).
    # .... more description needed.
    #-----------------------------------------------------------------
    # Unpack the six tensor groups from the flat argument tuple.
    xs = args[0 * n_params: 1 * n_params]
    r1s = args[1 * n_params: 2 * n_params]
    r2s = args[2 * n_params: 3 * n_params]
    r3s = args[3 * n_params: 4 * n_params]
    dls = args[4 * n_params: 5 * n_params]
    ds = args[5 * n_params: 6 * n_params]
    betal = beta
    beta = betan
    vs = [r3 / beta for r3 in r3s]
    r3s, upds = compute_Av(*vs)
    # Subtract shift * v from the operator product.
    r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
    # The (beta / betal) * r1 term is only subtracted from the second
    # step onwards (niter >= 1).
    r3s = [TT.switch(TT.ge(niter, constantX(1.)),
                     r3 - (beta / betal) * r1,
                     r3) for r3, r1 in zip(r3s, r1s)]
    alpha = inner_product(r3s, vs)
    r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
    r1s = [r2 for r2 in r2s]
    r2s = [r3 for r3 in r3s]
    if Ms is not None:
        # With Ms given, r3s is divided element-wise by Ms before the
        # new beta is computed.
        r3s = [r3 / M for r3, M in zip(r3s, Ms)]
        betan = sqrt_inner_product(r2s, r3s)
    else:
        betan = sqrt_inner_product(r3s)
    # NOTE(review): pnorml is assigned but not read later in this body.
    pnorml = pnorm
    pnorm = TT.switch(TT.eq(niter, constantX(0.)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                              TT.sqr(beta)))
    #-----------------------------------------------------------------
    ## Apply previous rotation Qk-1 to get
    #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
    #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
    #-----------------------------------------------------------------
    dbar = dbarn
    epln = eplnn
    dlta = cs * dbar + sn * alpha
    gbar = sn * dbar - cs * alpha
    eplnn = sn * betan
    dbarn = -cs * betan
    ## Compute the current plane rotation Qk
    # NOTE(review): gammal2 is assigned but not read later in this body.
    gammal2 = gammal
    gammal = gamma
    cs, sn, gamma = symGivens2(gbar, betan)
    tau = cs * phi
    phi = sn * phi
    Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
    # Update d
    dl2s = [dl for dl in dls]
    dls = [d for d in ds]
    # When gamma is zero the new d falls back to v and d_norm to inf.
    ds = [TT.switch(TT.neq(gamma, constantX(0.)),
                    (v - epln * dl2 - dlta * dl) / gamma,
                    v)
          for v, dl2, dl in zip(vs, dl2s, dls)]
    d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                       sqrt_inner_product(ds),
                       constantX(numpy.inf))
    # Update x except if it will become too big
    # NOTE(review): xnorml is assigned but not read later in this body.
    xnorml = xnorm
    # dl2s is reused here to hold the previous xs.
    dl2s = [x for x in xs]
    xs = [x + tau * d for x, d in zip(xs, ds)]
    xnorm = sqrt_inner_product(xs)
    # Revert to the previous xs and set flag 6 if the norm reached
    # maxxnorm.
    xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                    dl2, x)
          for dl2, x in zip(dl2s, xs)]
    flag = TT.switch(TT.ge(xnorm, maxxnorm),
                     constantX(6.), flag)
    # Estimate various norms
    rnorml = rnorm  # ||r_{k-1}||
    # NOTE(review): Anorml is overwritten further down and never read;
    # Acondl and relrnorml are not read later in this body.
    Anorml = Anorm
    Acondl = Acond
    relrnorml = relrnorm
    # The following estimates are only refreshed while flag != 6.
    flag_no_6 = TT.neq(flag, constantX(6.))
    Dnorm = TT.switch(flag_no_6,
                      TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                      Dnorm)
    xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
    rnorm = TT.switch(flag_no_6, phi, rnorm)
    relrnorm = TT.switch(flag_no_6,
                         rnorm / (Anorm * xnorm + bnorm),
                         relrnorm)
    Tnorm = TT.switch(flag_no_6,
                      TT.switch(TT.eq(niter, constantX(0.)),
                                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                TT.sqrt(TT.sqr(Tnorm) +
                                        TT.sqr(beta) +
                                        TT.sqr(alpha) +
                                        TT.sqr(betan))),
                      Tnorm)
    Anorm = TT.maximum(Anorm, pnorm)
    Acond = Anorm * Dnorm
    rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
    Anorml = rnorml * rootl
    relArnorml = rootl / Anorm

    #---------------------------------------------------------------
    # See if any of the stopping criteria are satisfied.
    # In rare cases, flag is already -1 from above (Abar = const*I).
    #---------------------------------------------------------------
    epsx = Anorm * xnorm * eps
    # NOTE(review): epsr is assigned but not read later in this body.
    epsr = Anorm * xnorm * rtol
    # Test for singular Hk (hence singular A)
    # or x is already an LS solution (so again A must be singular).
    t1 = constantX(1) + relrnorm
    t2 = constantX(1) + relArnorml

    # Only re-evaluated while flag is 0 or 6. The (condition, value)
    # pairs are handed to multiple_switch with the current flag as the
    # final argument -- presumably first match wins; confirm against
    # multiple_switch.
    flag = TT.switch(
        TT.bitwise_or(TT.eq(flag, constantX(0)),
                      TT.eq(flag, constantX(6))),
        multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                        TT.le(t2, constantX(1)), constantX(4),
                        TT.le(relrnorm, rtol), constantX(1),
                        TT.le(Anorm, constantX(1e-20)), constantX(12),
                        TT.le(relArnorml, rtol), constantX(10),
                        TT.ge(epsx, beta1), constantX(5),
                        TT.ge(xnorm, maxxnorm), constantX(6),
                        TT.ge(niter, TT.cast(maxit,
                                             theano.config.floatX)),
                        constantX(8),
                        flag),
        flag)

    # Flag 11 overrides any earlier value when Axnorm is below
    # rtol * Anorm * xnorm.
    flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                     constantX(11.), flag)
    return [niter + constantX(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
        theano.scan_module.scan_utils.until(TT.neq(flag, 0))
def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
         Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
         relrnorm, relArnorml, Anorm, flag, *args):
    """Perform one step of the iterative solver loop.

    The comments below refer to Lanczos vectors and plane rotations, so
    this appears to be one iteration of a MINRES-style solver -- confirm
    against the enclosing function.

    The 22 named parameters are scalar states carried between steps.
    ``args`` holds six consecutive groups of ``n_params`` tensors each:
    ``xs``, ``r1s``, ``r2s``, ``r3s``, ``dls`` and ``ds``.

    Relies on names from the enclosing scope: ``n_params``,
    ``compute_Av``, ``damp``, ``Ms``, ``npy_floatX``, ``floatX``,
    ``sqnorm``, ``norm``, ``symGivens2``, ``maxxnorm``, ``bnorm``,
    ``eps``, ``rtol``, ``beta1`` and ``maxiter``.

    :return: 2-tuple of (list of updated states in the same order as the
        parameters, an ``until`` condition that is true once ``flag`` is
        non-zero).
    """
    #-----------------------------------------------------------------
    ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
    # The general iteration is similar to the case k = 1 with v0 = 0:
    #
    #   p1      = Operator * v1  -  beta1 * v0,
    #   alpha1  = v1'p1,
    #   q2      = p2  -  alpha1 * v1,
    #   beta2^2 = q2'q2,
    #   v2      = (1/beta2) q2.
    #
    # Again, p = betak P vk,  where  P = C**(-1).
    # .... more description needed.
    #-----------------------------------------------------------------
    # Unpack the six tensor groups from the flat argument tuple.
    xs = args[0 * n_params: 1 * n_params]
    r1s = args[1 * n_params: 2 * n_params]
    r2s = args[2 * n_params: 3 * n_params]
    r3s = args[3 * n_params: 4 * n_params]
    dls = args[4 * n_params: 5 * n_params]
    ds = args[5 * n_params: 6 * n_params]
    betal = beta
    beta = betan
    vs = [r3/beta for r3 in r3s]
    r3s = compute_Av(*vs)
    # Add damp * v to the operator product.
    r3s = [r3 + damp*v for r3,v in zip(r3s, vs)]
    # The (beta/betal) * r1 term is only subtracted from the second step
    # onwards (niter >= 1).
    r3s = [TT.switch(TT.ge(niter, numpy.float64(1.)),
                     r3 - (beta/betal)*r1,
                     r3) for r3, r1 in zip(r3s, r1s)]
    alpha = sqnorm(r3s, vs)
    r3s = [r3 - (alpha/beta)*r2 for r3,r2 in zip(r3s,r2s)]
    r1s = [r2 for r2 in r2s]
    r2s = [r3 for r3 in r3s]
    if Ms is not None:
        # With Ms given, r3s is divided element-wise by Ms before the
        # new beta is computed.
        r3s = [r3/M for r3, M in zip(r3s, Ms)]
        betan = norm(r2s, r3s)
    else:
        betan = norm(r3s)
    # NOTE(review): pnorml is assigned but not read later in this body.
    pnorml = pnorm
    pnorm = TT.switch(TT.eq(niter, npy_floatX(0.)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                              TT.sqr(beta)))
    #-----------------------------------------------------------------
    ## Apply previous rotation Qk-1 to get
    #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
    #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
    #-----------------------------------------------------------------
    dbar = dbarn
    epln = eplnn
    dlta = cs*dbar + sn*alpha
    gbar = sn*dbar - cs*alpha
    eplnn = sn*betan
    dbarn = - cs*betan;
    ## Compute the current plane rotation Qk
    # NOTE(review): gammal2 is assigned but not read later in this body.
    gammal2 = gammal
    gammal = gamma
    cs, sn, gamma = symGivens2(gbar, betan)
    tau = cs*phi
    phi = sn*phi
    Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
    # Update d
    dl2s = [dl for dl in dls]
    dls = [d for d in ds]
    # When gamma is zero the new d falls back to v and d_norm to inf.
    ds = [TT.switch(TT.neq(gamma, npy_floatX(0.)),
                    (v - epln*dl2 - dlta*dl)/gamma,
                    v)
          for v,dl2,dl in zip(vs,dl2s, dls)]
    d_norm = TT.switch(TT.neq(gamma,npy_floatX(0.)),
                       norm(ds),
                       TT.constant((npy_floatX(numpy.inf))))
    # Update x except if it will become too big
    # NOTE(review): xnorml is assigned but not read later in this body.
    xnorml = xnorm
    # dl2s is reused here to hold the previous xs.
    dl2s = [x for x in xs]
    xs = [x + tau*d for x,d in zip(xs,ds)]
    xnorm = norm(xs)
    # Revert to the previous xs and set flag 6 if the norm reached
    # maxxnorm.
    xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                    dl2, x)
          for dl2,x in zip(dl2s,xs)]
    flag = TT.switch(TT.ge(xnorm, maxxnorm),
                     npy_floatX(6.), flag)
    # Estimate various norms
    rnorml = rnorm  # ||r_{k-1}||
    # NOTE(review): Anorml is overwritten further down and never read;
    # Acondl and relrnorml are not read later in this body.
    Anorml = Anorm
    Acondl = Acond
    relrnorml = relrnorm
    # The following estimates are only refreshed while flag != 6.
    flag_no_6 = TT.neq(flag, npy_floatX(6.))
    Dnorm = TT.switch(flag_no_6,
                      TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                      Dnorm)
    xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
    rnorm = TT.switch(flag_no_6, phi, rnorm)
    relrnorm = TT.switch(flag_no_6,
                         rnorm / (Anorm*xnorm + bnorm),
                         relrnorm)
    Tnorm = TT.switch(flag_no_6,
                      TT.switch(TT.eq(niter, npy_floatX(0.)),
                                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                TT.sqrt(TT.sqr(Tnorm) +
                                        TT.sqr(beta) +
                                        TT.sqr(alpha) +
                                        TT.sqr(betan))),
                      Tnorm)
    Anorm = TT.maximum(Anorm, pnorm)
    Acond = Anorm * Dnorm
    rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
    Anorml = rnorml*rootl
    relArnorml = rootl / Anorm

    #---------------------------------------------------------------
    # See if any of the stopping criteria are satisfied.
    # In rare cases, flag is already -1 from above (Abar = const*I).
    #---------------------------------------------------------------
    epsx = Anorm * xnorm * eps
    # NOTE(review): epsr is assigned but not read later in this body.
    epsr = Anorm * xnorm * rtol
    # Test for singular Hk (hence singular A)
    # or x is already an LS solution (so again A must be singular).
    t1 = npy_floatX(1) + relrnorm
    t2 = npy_floatX(1) + relArnorml

    # Only re-evaluated while flag is 0 or 6. The nested switches are
    # tested in order, so the first condition that holds decides the new
    # flag; if none holds the flag is left unchanged.
    flag = TT.switch(
        TT.bitwise_or(TT.eq(flag, npy_floatX(0.)),
                      TT.eq(flag, npy_floatX(6.))),
        TT.switch(TT.le(t1, npy_floatX(1.)), npy_floatX(3.),
        TT.switch(TT.le(t2, npy_floatX(1.)), npy_floatX(4.),
        TT.switch(TT.le(relrnorm, rtol), npy_floatX(1.),
        TT.switch(TT.le(Anorm, npy_floatX(1e-20)), npy_floatX(12),
        TT.switch(TT.le(relArnorml, rtol), npy_floatX(10.),
        TT.switch(TT.ge(epsx, beta1), npy_floatX(5.),
        TT.switch(TT.ge(xnorm, maxxnorm), npy_floatX(6.),
        TT.switch(TT.ge(niter, TT.cast(maxiter,floatX)),
                  npy_floatX(8.),
                  flag)))))))),
        flag)

    # Flag 11 overrides any earlier value when Axnorm is below
    # rtol * Anorm * xnorm.
    flag = TT.switch(TT.lt(Axnorm, rtol*Anorm*xnorm),
                     npy_floatX(11.), flag)
    return [ niter + npy_floatX(1.),
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag] + xs + r1s + r2s + r3s + dls + ds, \
        theano.scan_module.scan_utils.until(TT.neq(flag,0))
def fmin_cg_loop(old_fval, old_old_fval, *rest):
    """Perform one step of the conjugate-gradient loop.

    ``rest`` is laid out as ``n_elems`` current points, then ``n_elems``
    gradients, then the search directions. Relies on names from the
    enclosing scope (``n_elems``, ``f``, ``myfprime``, ``gtol``,
    ``zero``, ``profile``, ``linesearch``, ``ifelse``, ``lazy_or``,
    ``until``, ``izip``).

    :param old_fval: function value carried from the previous step.
    :param old_old_fval: function value from the step before that.
    :return: tuple ``(states, until(stop))`` where ``states`` is
        ``[old_fval, old_old_fval] + xks + gfks + pks``.
    """
    points = rest[:n_elems]
    grads = rest[n_elems:n_elems * 2]
    directions = rest[n_elems * 2:]

    # Largest absolute gradient entry over all gradient tensors.
    per_grad_max = [abs(g).max(axis=range(g.ndim)) for g in grads]
    gnorm = per_grad_max[0]
    for other_max in per_grad_max[1:]:
        gnorm = TT.maximum(gnorm, other_max)

    deltak = sum((g * g).sum() for g in grads)

    prev_fval = old_fval
    prev_prev_fval = old_old_fval
    line_search_out = linesearch.line_search_wolfe2(
        f, myfprime, points, directions,
        prev_fval, prev_prev_fval,
        profile=profile,
        gfks=grads)
    alpha_k, old_fval, old_old_fval, derphi0, nw_gfks = line_search_out

    def _freeze_or_advance(current, advanced):
        # Keep `current` when the gradient is already below tolerance or
        # the line search produced a nan / zero step; otherwise take
        # `advanced`.
        return ifelse(gnorm <= gtol,
                      current,
                      ifelse(TT.bitwise_or(TT.isnan(alpha_k),
                                           TT.eq(alpha_k, zero)),
                             current,
                             advanced))

    points = [_freeze_or_advance(xk, xk + alpha_k * pk)
              for xk, pk in zip(points, directions)]

    # Use the freshly evaluated gradient only when derphi0 is nan;
    # otherwise take the gradient returned by the line search.
    recomputed_grads = myfprime(*points)
    next_grads = []
    for fresh, from_search in zip(recomputed_grads, nw_gfks):
        next_grads.append(ifelse(TT.isnan(derphi0), fresh, from_search))

    grad_deltas = [g_next - g_cur
                   for g_next, g_cur in izip(next_grads, grads)]
    # Polak-Ribiere formula.
    beta_k = TT.maximum(
        zero,
        sum((dy * g_next).sum()
            for dy, g_next in izip(grad_deltas, next_grads)) / deltak)

    directions = [_freeze_or_advance(pk, -g_next + beta_k * pk)
                  for g_next, pk in zip(next_grads, directions)]
    grads = [_freeze_or_advance(g_cur, g_next)
             for (g_cur, g_next) in izip(grads, next_grads)]

    # warnflag = 2
    stop = lazy_or(gnorm <= gtol,
                   TT.bitwise_or(TT.isnan(alpha_k), TT.eq(alpha_k, zero)))

    # Restore the incoming function values when the gradient is already
    # within tolerance.
    old_fval = ifelse(gnorm > gtol, old_fval, prev_fval)
    old_old_fval = ifelse(gnorm > gtol, old_old_fval, prev_prev_fval)

    new_states = [old_fval, old_old_fval] + points + grads + directions
    return (new_states, until(stop))