def compute_Gv(*args):
    # Product of the Fisher metric with the vectors in `args` (one tensor
    # per parameter), computed matrix-free with the R-op/L-op trick. The
    # two terms handle the sigmoid and the softmax output units, each
    # rescaled by the inverse output variance and averaged over the batch.
    (hid_sig, hid_sftmax) = self.get_hiddens()
    nw_args1 = TT.Lop(
        hid_sig, self.params,
        TT.Rop(hid_sig, self.params, args) /
        ((1 - hid_sig) * hid_sig * self.batchsize))
    nw_args2 = TT.Lop(
        hid_sftmax, self.params,
        TT.Rop(hid_sftmax, self.params, args) /
        (hid_sftmax * self.batchsize))
    fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
    new_vals = safe_clone(fin_vals, [self.X, self.Y],
                          [self.loc_x, self.loc_y])
    return new_vals, {}
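# --------------------------------------------------------------------------
# The closure above never forms the Fisher matrix G explicitly: it returns
# G*v as J^T (M (J v)), where J is the Jacobian of the layer outputs with
# respect to the parameters, J v comes from the forward R-operator, M is a
# diagonal rescaling by the inverse output variance (sigmoid: 1/(s(1-s));
# softmax: 1/p), and the outer L-operator applies J^T. A minimal
# self-contained sketch of the same pattern, with hypothetical names and a
# sigmoid layer only (not taken from the original source):
#
#   import numpy, theano, theano.tensor as TT
#   W = theano.shared(numpy.random.randn(5, 3).astype('float32'))
#   x = TT.matrix('x')
#   h = TT.nnet.sigmoid(TT.dot(x, W))
#   v = TT.matrix('v')                       # candidate direction, shape of W
#   Jv = TT.Rop(h, W, v)                     # forward mode: J * v
#   Gv = TT.Lop(h, W, Jv / (h * (1 - h)))    # reverse mode: J^T (M (J v))
#
# `Gv` is then a symbolic metric-vector product that a Krylov solver such as
# minres/minresQLP can call repeatedly to solve G * r = g.
# --------------------------------------------------------------------------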
def __init__(self, model, state, data):
    """
    Parameters:

    :param model: Class describing the model used. It should provide the
        computational graph to evaluate the model.
    :param state: Dictionary containing the current state of your job.
        This includes the configuration of the job, specifically the seed,
        the starting damping factor, batch size, etc. See main.py for
        details.
    :param data: Class describing the dataset used by the model.
    """
    #####################################
    # Step 0. Construct shared variables
    #####################################
    n_params = len(model.params)
    cbs = state['cbs']
    bs = state['bs']
    ebs = state['ebs']
    mbs = state['mbs']
    profile = state['profile']
    self.model = model
    self.rng = numpy.random.RandomState(state['seed'])
    self.damping = theano.shared(numpy.float32(state['damp']))
    # Accumulators for the Euclidean gradients and for the resulting
    # Riemannian (natural) gradients, one shared buffer per parameter.
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    # Input/output buffers for the loop that applies the metric in chunks.
    self.loop_inps = [theano.shared(
        numpy.zeros(shp, dtype=theano.config.floatX))
        for shp in model.params_shape]
    self.loop_outs = [theano.shared(
        numpy.zeros(shp, dtype=theano.config.floatX))
        for shp in model.params_shape]
    self.step = 0
    self.cbs = cbs
    self.bs = bs
    self.ebs = ebs
    self.mbs = mbs
    self.state = state
    self.profile = profile
    self.data = data
    self.step_timer = time.time()

    ############################################################
    # Step 1. Compile function for computing Euclidean gradients
    ############################################################
    print 'Constructing grad function'
    bdx = TT.iscalar('batch_idx')
    loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in self.data.variables]
    cost = safe_clone(model.train_cost, model.inputs, loc_data)
    gs = TT.grad(cost, model.params)
    ratio = numpy.float32(float(bs) / cbs)
    # Each call accumulates the gradient of one sub-batch of size cbs;
    # dividing by `ratio` yields the average over the full batch of size bs.
    update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]
    print 'Compiling grad function'
    st = time.time()
    self.loc_grad_fn = theano.function(
        [bdx], [], updates=update, name='loc_fn_grad', profile=profile)
    print 'took', time.time() - st

    #############################################################
    # Step 2. Compile function for computing Riemannian gradients
    #############################################################
    loc_x = self.data._natgrad[bdx * cbs: (bdx + 1) * cbs]
    loc_y = self.data._natgrady[bdx * cbs: (bdx + 1) * cbs]
    loc_Gvs = safe_clone(model.Gvs(*self.loop_inps),
                         [model.X, model.Y],
                         [loc_x, loc_y])
    updates = [(l, l + lg) for l, lg in zip(self.loop_outs, loc_Gvs)]
    st = time.time()
    loc_Gv_fn = theano.function(
        [bdx], [], updates=updates, name='loc_fn_rop', profile=profile)
    print 'took', time.time() - st

    def compute_Gv(*args):
        # Apply the metric in mbs // cbs chunks, accumulating the partial
        # products in self.loop_outs.
        rval = forloop(loc_Gv_fn,
                       mbs // cbs,
                       self.loop_inps,
                       self.loop_outs)(*args)
        return rval, {}

    print 'Constructing riemannian gradient function'
    st = time.time()
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    # Solve G * r = g with a Krylov method; the gradients are normalized
    # before the solve and the solution is rescaled afterwards.
    if not state['minresQLP']:
        self.msgs = minres_messages
        rvals = minres(compute_Gv,
                       [x / norm_grads for x in self.gs],
                       rtol=state['mrtol'],
                       damp=self.damping,
                       maxit=state['miters'],
                       profile=state['profile'])
    else:
        self.msgs = minresQLP_messages[1:]
        rvals = minresQLP(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          model.params_shape,
                          rtol=state['mrtol'],
                          damp=self.damping,
                          maxit=state['miters'],
                          TranCond=state['trancond'],
                          profile=state['profile'])
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = TT.cast(rvals[1], 'int32')
    niters = rvals[2]
    rel_residual = rvals[3]
    Anorm = rvals[4]
    Acond = rvals[5]
    norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
    # Infinity norm of the natural gradient, tracked for monitoring.
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    updates = zip(self.rs, nw_rs)
    print 'took', time.time() - st
    print 'Compiling riemannian gradient function'
    st = time.time()
    self.compute_natural_gradients = theano.function(
        [],
        [flag, niters, rel_residual, Anorm, Acond,
         norm_grads, norm_rs_grads, norm_ord0],
        updates=updates,
        allow_input_downcast=True,
        name='compute_riemannian_gradients',
        on_unused_input='warn',
        profile=profile)
    print 'took', time.time() - st

    ###########################################################
    # Step 3. Compile function for evaluating cost and updating
    # parameters
    ###########################################################
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(state['lr'])
    loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in self.data.variables]
    old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
    self.loc_old_cost = theano.function(
        [bdx], old_cost, name='loc_old_cost', profile=profile)
    new_params = [p - lr * r for p, r in zip(model.params, self.rs)]
    new_cost = safe_clone(model.train_cost,
                          model.inputs + model.params,
                          loc_data + new_params)
    new_err = safe_clone(model.error,
                         model.inputs + model.params,
                         loc_data + new_params)
    self.loc_new_cost = theano.function(
        [bdx, lr], [new_cost, new_err],
        name='loc_new_cost', profile=profile)
    updates = dict(zip(model.params, new_params))
    model.dbm_class.censor_updates(updates)
    self.update_params = theano.function(
        [lr], [], updates=updates, name='update_params')
    old_cost = TT.scalar('old_cost')
    new_cost = TT.scalar('new_cost')
    p_norm = TT.scalar('p_norm')
    prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
    #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
    #    TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
    dist = -lr * prod
    angle = prod / p_norm
    rho = (new_cost - old_cost) / dist
    self.compute_rho = theano.function(
        [old_cost, new_cost, lr, p_norm],
        [rho, dist, angle],
        name='compute_rho',
        profile=profile)
    self.old_cost = 1e20
    self.__new_cost = 0
    self.__error = 0
    self.return_names = ['cost',
                         'old_cost',
                         'error',
                         'time_grads',
                         'time_metric',
                         'time_eval',
                         'minres_flag',
                         'minres_iters',
                         'minres_relres',
                         'minres_Anorm',
                         'minres_Acond',
                         'norm_ord0',
                         'norm_grad',
                         'norm_nat',
                         'lr',
                         'grad_angle',
                         #'r_g',
                         #'icost',
                         'damping',
                         'rho']
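# --------------------------------------------------------------------------
# Hedged sketch of how one optimization step might drive the functions
# compiled above (the class name `NaturalSGD` and the call order are
# assumptions; the real driver lives in main.py, which is not part of this
# listing):
#
#   trainer = NaturalSGD(model, state, data)      # the __init__ above
#   for bdx in xrange(trainer.bs // trainer.cbs):
#       trainer.loc_grad_fn(bdx)                  # accumulate Euclidean grads
#   stats = trainer.compute_natural_gradients()   # minres / minresQLP solve
#   old_cost = trainer.loc_old_cost(0)
#   new_cost, new_err = trainer.loc_new_cost(0, trainer.lr)
#   trainer.update_params(trainer.lr)             # p <- p - lr * r
#
# The gradient accumulators in self.gs are shared variables updated with
# g + lg / ratio, so a real driver also has to zero them between steps.
# --------------------------------------------------------------------------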
def __init__(self, model, state, data):
    """
    Parameters:

    :param model: Class describing the model used. It should provide the
        computational graph to evaluate the model.
    :param state: Dictionary containing the current state of your job.
        This includes the configuration of the job, specifically the seed,
        the starting damping factor, batch size, etc. See main.py for
        details.
    :param data: Class describing the dataset used by the model.
    """
    #####################################
    # Step 0. Construct shared variables
    #####################################
    n_params = len(model.params)
    cbs = state['cbs']
    bs = state['bs']
    mbs = state['mbs']
    ebs = state['ebs']
    profile = state['profile']
    self.model = model
    self.rng = numpy.random.RandomState(state['seed'])
    srng = RandomStreams(self.rng.randint(213))
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.loop_inps = [theano.shared(
        numpy.zeros(shp, dtype=theano.config.floatX))
        for shp in model.params_shape]
    self.loop_outs = [theano.shared(
        numpy.zeros(shp, dtype=theano.config.floatX))
        for shp in model.params_shape]
    self.step = 0
    self.cbs = cbs
    self.bs = bs
    self.mbs = mbs
    self.ebs = ebs
    self.state = state
    self.profile = profile
    self.data = data
    self.step_timer = time.time()

    ############################################################
    # Step 1. Compile function for computing Euclidean gradients
    ############################################################
    print 'Constructing grad function'
    bdx = TT.iscalar('batch_idx')
    # Here self.data.variables are callables returning the slice
    # [bdx * cbs, (bdx + 1) * cbs) of each variable.
    loc_data = [x(bdx * cbs, (bdx + 1) * cbs) for x in self.data.variables]
    cost = safe_clone(model.train_cost, model.inputs, loc_data)
    gs = TT.grad(cost, model.params)
    ratio = numpy.float32(float(bs) / cbs)
    update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]
    print 'Compiling grad function'
    st = time.time()
    self.loc_grad_fn = theano.function(
        [bdx], [], updates=update, name='loc_fn_grad', profile=profile)
    print 'took', time.time() - st
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))

    ###########################################################
    # Step 3. Compile function for evaluating cost and updating
    # parameters (this variant skips the Riemannian Step 2)
    ###########################################################
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(state['lr'])
    loc_data = [x(bdx * cbs, (bdx + 1) * cbs) for x in self.data.variables]
    old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
    self.loc_old_cost = theano.function(
        [bdx], old_cost, name='loc_old_cost', profile=profile)
    new_params = [p - lr * r for p, r in zip(model.params, self.gs)]
    new_cost = safe_clone(model.train_cost,
                          model.inputs + model.params,
                          loc_data + new_params)
    new_err = safe_clone(model.error,
                         model.inputs + model.params,
                         loc_data + new_params)
    self.loc_new_cost = theano.function(
        [bdx, lr], [new_cost, new_err],
        name='loc_new_cost', profile=profile)
    # Same evaluation, but over the held-out evaluation variables.
    loc_data = [x[bdx * cbs: (bdx + 1) * cbs]
                for x in self.data.eval_variables]
    new_cost = safe_clone(model.train_cost,
                          model.inputs + model.params,
                          loc_data + new_params)
    new_err = safe_clone(model.error,
                         model.inputs + model.params,
                         loc_data + new_params)
    self.loc_new_cost_all = theano.function(
        [bdx, lr], [new_cost, new_err],
        name='loc_new_cost_all', profile=profile)
    self.update_params = theano.function(
        [lr], [], updates=zip(model.params, new_params),
        name='update_params')
    old_cost = TT.scalar('old_cost')
    new_cost = TT.scalar('new_cost')
    # For plain SGD the descent direction is the gradient itself, so the
    # predicted change in cost is -lr * g^T g.
    dist = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.gs)])
    rho = (new_cost - old_cost) / dist
    self.compute_rho = theano.function(
        [old_cost, new_cost, lr], [rho, norm_grads],
        name='compute_rho', profile=profile)
    self.old_cost = 1e20
    self.return_names = ['cost', 'error', 'time_grads',
                         'time_eval', 'norm_grad', 'rho', 'lr']
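# --------------------------------------------------------------------------
# compute_rho above returns the reduction ratio
#     rho = (new_cost - old_cost) / (-lr * g^T g),
# i.e. the actual change in cost divided by the change predicted by the
# first-order model. A hedged sketch of the usual trust-region style use of
# this quantity (the thresholds are illustrative, not taken from this
# listing):
#
#   rho, norm_grad = self.compute_rho(old_cost, new_cost, self.lr)
#   if rho < .25:        # far less progress than predicted: shrink the step
#       self.lr *= .5
#   elif rho > .75:      # prediction reliable: allow a larger step
#       self.lr *= 2.
# --------------------------------------------------------------------------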
def __init__(self, model, state, data):
    """
    Parameters:

    :param model: Class describing the model used. It should provide the
        computational graph to evaluate the model.
    :param state: Dictionary containing the current state of your job.
        This includes the configuration of the job, specifically the seed,
        the starting damping factor, batch size, etc. See main.py for
        details.
    :param data: Class describing the dataset used by the model.
    """
    #####################################
    # Step 0. Construct shared variables
    #####################################
    n_params = len(model.params)
    bs = state['bs']
    profile = state['profile']
    self.model = model
    self.rng = numpy.random.RandomState(state['seed'])
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in model.params_shape]
    self.bs = bs
    self.state = state
    self.profile = profile
    self.data = data
    self.data.set_iterator(order='sequence',
                           rng=self.rng,
                           batchsize=self.state['bs'])
    self.data_iter = data.__iter__()
    self.step_timer = time.time()

    ############################################################
    # Step 1. Compile function for computing Euclidean gradients
    ############################################################
    print 'Constructing grad function'
    gs = TT.grad(self.model.train_cost, model.params)
    # This variant overwrites the gradient buffers in a single call
    # instead of accumulating over sub-batches.
    update = [(g, lg) for g, lg in zip(self.gs, gs)]
    print 'Compiling grad function'
    st = time.time()
    self.grad_fn = theano.function(self.model.inputs, [],
                                   updates=update,
                                   name='loc_fn_grad',
                                   profile=profile)
    print 'took', time.time() - st
    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))

    ###########################################################
    # Step 3. Compile function for evaluating cost and updating
    # parameters
    ###########################################################
    print 'constructing evaluation function'
    lr = TT.scalar('lr')
    self.lr = numpy.float32(state['lr'])
    old_cost = model.train_cost
    self.compute_old_cost = theano.function(self.model.inputs,
                                            old_cost,
                                            name='loc_old_cost',
                                            profile=profile)
    new_params = [p - lr * r for p, r in zip(model.params, self.gs)]
    new_cost = safe_clone(model.train_cost, model.params, new_params)
    new_err = safe_clone(model.error, model.params, new_params)
    self.compute_new_cost = theano.function([lr] + self.model.inputs,
                                            [new_cost, new_err],
                                            name='loc_new_cost',
                                            profile=profile)
    self.update_params = theano.function(
        [lr], [], updates=zip(model.params, new_params),
        name='update_params')
    old_cost = TT.scalar('old_cost')
    new_cost = TT.scalar('new_cost')
    dist = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.gs)])
    rho = (new_cost - old_cost) / dist
    self.compute_rho = theano.function([old_cost, new_cost, lr],
                                       [rho, norm_grads],
                                       name='compute_rho',
                                       profile=profile)
    self.old_cost = 1e20
    self.step = 0
    self.return_names = ['cost', 'error', 'time_grads',
                         'time_eval', 'norm_grad', 'rho', 'lr']
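# --------------------------------------------------------------------------
# Hedged usage sketch for the plain SGD variant above (the class name `SGD`
# is an assumption; the iterator yields whatever self.model.inputs expects):
#
#   sgd = SGD(model, state, data)                 # the __init__ above
#   batch = sgd.data_iter.next()
#   sgd.grad_fn(*batch)                           # overwrite self.gs
#   old_cost = sgd.compute_old_cost(*batch)
#   new_cost, new_err = sgd.compute_new_cost(sgd.lr, *batch)
#   sgd.update_params(sgd.lr)                     # p <- p - lr * g
#
# Note that compute_new_cost evaluates the shifted parameters without
# committing them, which is what allows the rho-based step-size control to
# run before update_params is called.
# --------------------------------------------------------------------------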
def __init__(self, X, Y, dbm, cost, batchsize=200, init_damp=5.,
             min_damp=.001, damp_ratio=5. / 4., mrtol=1e-4, miters=100,
             trancond=1e-4, lr=.1, adapt_rho=1):
    """
    X: theano design matrix of inputs
    Y: theano design matrix of features
    batchsize: int, describing the batch size
    init_damp: float, initial damping value
    min_damp: float, minimal damping value allowed
    damp_ratio: float, ratio used to increase damping (we decrease by
        1./ratio)
    mrtol: float, relative tolerance error for the inversion of the metric
    miters: int, maximal number of iterations for minres
    trancond: float, (ignore) threshold for switching from MinresQLP to
        Minres
    lr: float/shared variable; learning rate
    adapt_rho: 0 or 1, whether the damping should be heuristically adapted
    """
    self.batchsize = batchsize
    self.adapt_rho = adapt_rho
    self.damp_ratio = damp_ratio
    self.min_damp = min_damp
    self.dbm = dbm
    self.cost = cost
    self.X = X
    self.Y = Y
    descr = self.cost.get_fixed_var_descr(self.dbm, X, Y)
    self._on_load_batch = descr.on_load_batch[0]
    self.drop_mask = descr.fixed_vars['drop_mask']
    self.drop_mask_Y = descr.fixed_vars['drop_mask_Y']
    self.params = self.get_params()
    self.params_shape = [x.get_value(borrow=True).shape
                         for x in self.params]
    self.damping = theano.shared(numpy.float32(init_damp))
    self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in self.params_shape]
    self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
               for shp in self.params_shape]
    cost = self.get_cost()
    gs = TT.grad(cost, self.params)
    self.loc_grad_fn = theano.function([self.X, self.Y], [],
                                       updates=zip(self.gs, gs),
                                       name='loc_fn_grad')

    ##################################################################
    # Metric-vector product for minresQLP (fixed-size data buffers)
    ##################################################################
    self.loc_x = theano.shared(numpy.zeros((20, 784), dtype='float32'))
    self.loc_y = theano.shared(numpy.zeros((20, 10), dtype='float32'))

    def compute_Gv(*args):
        (hid_sig, hid_sftmax) = self.get_hiddens()
        nw_args1 = TT.Lop(
            hid_sig, self.params,
            TT.Rop(hid_sig, self.params, args) /
            ((1 - hid_sig) * hid_sig * self.batchsize))
        nw_args2 = TT.Lop(
            hid_sftmax, self.params,
            TT.Rop(hid_sftmax, self.params, args) /
            (hid_sftmax * self.batchsize))
        fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
        new_vals = safe_clone(fin_vals, [self.X, self.Y],
                              [self.loc_x, self.loc_y])
        return new_vals, {}

    norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
    self.msgs = minresQLP_messages[1:]
    rvals = minresQLP(compute_Gv,
                      [x / norm_grads for x in self.gs],
                      self.params_shape,
                      rtol=mrtol,
                      damp=self.damping,
                      maxit=miters,
                      TranCond=trancond)
    nw_rs = [x * norm_grads for x in rvals[0]]
    flag = TT.cast(rvals[1], 'int32')
    niters = rvals[2]
    rel_residual = rvals[3]
    Anorm = rvals[4]
    Acond = rvals[5]
    norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
    norm_ord0 = TT.max(abs(nw_rs[0]))
    for r in nw_rs[1:]:
        norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
    updates = zip(self.rs, nw_rs)
    self.compute_natural_gradients = theano.function(
        [],
        [flag, niters, rel_residual, Anorm, Acond,
         norm_grads, norm_rs_grads, norm_ord0],
        updates=updates,
        allow_input_downcast=True,
        name='compute_riemannian_gradients',
        on_unused_input='warn')
    self.loc_old_cost = theano.function(
        [self.X, self.Y], cost, name='loc_old_cost')
    new_params = [p - lr * r for p, r in zip(self.params, self.rs)]
    new_cost = safe_clone(cost, self.params, new_params)
    # NOTE: no separate error graph is defined for the DBM, so new_err
    # re-clones the cost at the shifted parameters.
    new_err = safe_clone(cost, self.params, new_params)
    self.loc_new_cost = theano.function(
        [self.X, self.Y], [new_cost, new_err], name='loc_new_cost')
    updates = dict(zip(self.params, new_params))
    self.censor_updates(updates)
    self.update_params = theano.function(
        [], [], updates=updates, name='update_params')
    old_cost = TT.scalar('old_cost')
    new_cost = TT.scalar('new_cost')
    p_norm = TT.scalar('p_norm')
    prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
    #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
    #    TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
    dist = -lr * prod
    angle = prod / p_norm
    rho = (new_cost - old_cost) / dist
    self.compute_rho = theano.function(
        [old_cost, new_cost, p_norm],
        [rho, dist, angle],
        name='compute_rho')
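# --------------------------------------------------------------------------
# With adapt_rho=1 the intent (per the docstring above) is a
# Levenberg-Marquardt style schedule: increase the damping by damp_ratio
# when rho is poor and decrease it by 1/damp_ratio, never below min_damp,
# when rho is good. A hedged sketch of such a schedule (the thresholds are
# illustrative assumptions, not taken from this listing):
#
#   rho, dist, angle = self.compute_rho(old_cost, new_cost, p_norm)
#   damp = self.damping.get_value()
#   if rho < .25:
#       self.damping.set_value(numpy.float32(damp * self.damp_ratio))
#   elif rho > .75:
#       self.damping.set_value(
#           numpy.float32(max(damp / self.damp_ratio, self.min_damp)))
# --------------------------------------------------------------------------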