def get_cost_updates(self, corruption_level, learning_rate):
    """ This function computes the cost and the updates for one training
    step of the dA """
    # this is how if-then-else is written in Theano
    tilde_x = T.switch(T.gt(corruption_level, 0),
                       self.get_corrupted_input(self.x, corruption_level),
                       self.x)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    act = T.dot(tilde_x, self.W) + self.b  # pre-activation of the hidden layer (currently unused)
    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    # note : L is now a vector, where each element is the reconstruction
    #        error (here the Euclidean distance between input and
    #        reconstruction) of the corresponding example of the minibatch.
    #        We need to compute the average of all these to get the cost of
    #        the minibatch
    L = T.sqrt(T.sum(T.sqr(T.sub(self.x, z)), axis=1))
    reg = T.sum(y, axis=0) / T.shape(y)[0]  # average hidden activation over the minibatch
    rho = T.constant(0.05)
    beta = T.constant(self.beta)
    reg1 = T.sum(rho * T.log(rho / reg) +
                 (1 - rho) * T.log((1 - rho) / (1 - reg)))
    cost = T.mean(L) + beta * reg1

    # compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # generate the list of updates (an OrderedDict so the update order is deterministic)
    updates = collections.OrderedDict()
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - learning_rate * gparam

    return (cost, updates)
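# Hedged usage sketch (not part of the class above): how the (cost, updates)
# pair returned by get_cost_updates is typically compiled into a training
# function. `da`, `train_set_x`, `batch_size` and `n_train_batches` are
# hypothetical names assumed to exist in the calling script.
import theano
import theano.tensor as T

index = T.lscalar('index')
cost, updates = da.get_cost_updates(corruption_level=0.3, learning_rate=0.1)
train_da = theano.function(
    [index], cost, updates=updates,
    givens={da.x: train_set_x[index * batch_size:(index + 1) * batch_size]})
for batch_index in range(n_train_batches):
    batch_cost = train_da(batch_index)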
def compute_output(self, network):
    hyperparameter_name = network.find_hyperparameter(["hyperparameter"])
    # TODO add default hyperparameter
    res = network.find_hyperparameter([hyperparameter_name])
    if utils.is_number(res):
        var = T.constant(res)
        shape = ()
    elif utils.is_ndarray(res):
        var = T.constant(res)
        shape = res.shape
    elif utils.is_shared_variable(res):
        var = res
        shape = res.get_value().shape
    elif utils.is_nonshared_variable(res):
        var = res
        if res.ndim == 0:
            shape = ()
        else:
            shape = network.find_hyperparameter(["shape"])
    else:
        raise ValueError("Unknown hyperparameter type of %s" % res)
    network.create_vw(
        "default",
        variable=var,
        shape=shape,
        tags={"output"},
    )
def test_alloc_memset_0():
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
def __init__(self, input_dim, N, init_scale=2.0):
    """
    A zoomable attention window for 1-dimensional inputs.

    Parameters
    ----------
    input_dim : int
        length of the input vectors
    N : int
        length of the attention window
    init_scale : float
        initial scaling for inputs vs. attention window
    """
    self.input_dim = input_dim
    self.N = N
    self.init_scale = init_scale
    # make offsets for internal dispersal of grid points.
    # -- internal grid coordinates range over [-1...+1]
    offsets = np.arange(N) - (N / 2.0) + 0.5
    offsets = offsets / np.max(offsets)
    offsets = offsets.astype(theano.config.floatX)
    self.grid_offsets = T.constant(offsets)
    # make coordinate vectors for location in the input.
    # -- coordinates for the smallest dimension are scaled to range over
    #    [-init_scale....init_scale].
    x_coords = (np.arange(input_dim) - (input_dim / 2.0) + 0.5)
    x_coords = (init_scale / np.max(x_coords)) * x_coords
    x_coords = x_coords.astype(theano.config.floatX)
    self.x_coords = T.constant(x_coords)
    return
def hard_sigmoid(x):
    out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype=x.dtype))[0].dtype
    slope = T.constant(0.2, dtype=out_dtype)
    shift = T.constant(0.5, dtype=out_dtype)
    x = (x * slope) + shift
    x = T.clip(x, 0, 1)
    return x
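# Quick sanity check of the piecewise-linear approximation above,
# hard_sigmoid(x) = clip(0.2 * x + 0.5, 0, 1); a sketch assuming hard_sigmoid
# is importable in the current scope.
import theano
import theano.tensor as T

x_check = T.vector('x_check')
f_hard_sigmoid = theano.function([x_check], hard_sigmoid(x_check),
                                 allow_input_downcast=True)
print(f_hard_sigmoid([-5.0, -1.0, 0.0, 1.0, 5.0]))
# expected approximately [0.0, 0.3, 0.5, 0.7, 1.0]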
def generate_forward_diffusion_sample(self, X_noiseless):
    """
    Corrupt a training image with t steps worth of Gaussian noise, and
    return the corrupted image, as well as the mean and covariance of the
    posterior q(x^{t-1}|x^t, x^0).
    """
    X_noiseless = X_noiseless.reshape(
        (-1, self.n_colors, self.spatial_width, self.spatial_width))
    n_images = X_noiseless.shape[0].astype('int16')
    rng = Random().theano_rng
    # choose a timestep in [1, self.trajectory_length-1].
    # note the reverse process is fixed for the very
    # first timestep, so we skip it.
    # TODO for some reason random_integer is missing from the Blocks
    # theano random number generator.
    t = T.floor(rng.uniform(size=(1, 1), low=1, high=self.trajectory_length,
                            dtype=theano.config.floatX))
    t_weights = self.get_t_weights(t)
    N = rng.normal(size=(n_images, self.n_colors, self.spatial_width,
                         self.spatial_width),
                   dtype=theano.config.floatX)

    # noise added this time step
    beta_forward = self.get_beta_forward(t)
    # decay in noise variance due to original signal this step
    alpha_forward = 1. - beta_forward
    # compute total decay in the fraction of the variance due to X_noiseless
    alpha_arr = 1. - self.beta_arr
    alpha_cum_forward_arr = T.extra_ops.cumprod(alpha_arr).reshape(
        (self.trajectory_length, 1))
    alpha_cum_forward = T.dot(t_weights.T, alpha_cum_forward_arr)
    # total fraction of the variance due to noise being mixed in
    beta_cumulative = 1. - alpha_cum_forward
    # total fraction of the variance due to noise being mixed in one step ago
    beta_cumulative_prior_step = 1. - alpha_cum_forward / alpha_forward

    # generate the corrupted training data
    X_uniformnoise = X_noiseless + (
        rng.uniform(size=(n_images, self.n_colors, self.spatial_width,
                          self.spatial_width),
                    dtype=theano.config.floatX)
        - T.constant(0.5, dtype=theano.config.floatX)
    ) * T.constant(self.uniform_noise, dtype=theano.config.floatX)
    X_noisy = (X_uniformnoise * T.sqrt(alpha_cum_forward) +
               N * T.sqrt(1. - alpha_cum_forward))

    # compute the mean and covariance of the posterior distribution
    mu1_scl = T.sqrt(alpha_cum_forward / alpha_forward)
    mu2_scl = 1. / T.sqrt(alpha_forward)
    cov1 = 1. - alpha_cum_forward / alpha_forward
    cov2 = beta_forward / alpha_forward
    lam = 1. / cov1 + 1. / cov2
    mu = (X_uniformnoise * mu1_scl / cov1 + X_noisy * mu2_scl / cov2) / lam
    sigma = T.sqrt(1. / lam)
    sigma = sigma.reshape((1, 1, 1, 1))

    mu.name = 'mu q posterior'
    sigma.name = 'sigma q posterior'
    X_noisy.name = 'X_noisy'
    t.name = 't'

    return X_noisy, t, mu, sigma
def lcn_std_diff(x, size=9):
    # Function borrowed from bengioe_util
    p = x.reshape((1, 1, 48, 48))
    # p = (p - TT.mean(p)) / T.std(p)
    g = gaussian(size, 1.591 / size)
    g /= g.sum()
    g = numpy.float32(g.reshape((1, 1, size, size)))
    mean = TT.nnet.conv.conv2d(p, TT.constant(g),
                               (1, 1, 48, 48),
                               (1, 1, size, size),
                               'full').reshape((48 + size - 1,) * 2)
    mean = mean[size / 2:48 + size / 2,
                size / 2:48 + size / 2]
    meansq = TT.nnet.conv.conv2d(TT.sqr(p), TT.constant(g),
                                 (1, 1, 48, 48),
                                 (1, 1, size, size),
                                 'full').reshape((48 + size - 1,) * 2)
    meansq = meansq[size / 2:48 + size / 2,
                    size / 2:48 + size / 2]
    var = meansq - TT.sqr(mean)
    var = TT.clip(var, 0, 1e30)
    std = TT.sqrt(var)
    std = TT.clip(std, TT.mean(std), 1e30)
    out = (p - mean) / std
    return out - out.min()
def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
    clip = params["grad_clip"]
    decay_rate = tensor.constant(params["decay_rate"],
                                 dtype=theano.config.floatX)
    smooth_eps = tensor.constant(params["smooth_eps"],
                                 dtype=theano.config.floatX)
    zipped_grads = [theano.shared(np.zeros_like(p.get_value()),
                                  name="%s_grad" % k)
                    for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(np.zeros_like(p.get_value()),
                                    name="%s_rgrad2" % k)
                      for k, p in tparams.iteritems()]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    if clip > 0.0:
        rg2up = [(rg2,
                  tensor.clip(decay_rate * rg2 +
                              (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2),
                              0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]
    else:
        rg2up = [(rg2,
                  tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2),
                              0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up,
                                    name="rmsprop_f_grad_shared")

    updir = [theano.shared(p.get_value() * numpy_floatX(0.0),
                           name="%s_updir" % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps))
                 for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input="ignore",
                               name="rmsprop_f_update")

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
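# Hedged usage sketch: the two compiled functions returned above split one
# optimization step into (1) computing the cost and refreshing the gradient
# statistics and (2) applying the RMSProp update. `solver`, `tparams`,
# `grads`, `inp_list`, `cost`, `params` and `batches` are hypothetical names
# standing in for the surrounding model code.
import theano.tensor as tensor

lr = tensor.scalar(name='lr')
f_grad_shared, f_update = solver.rmsprop(lr, tparams, grads,
                                         inp_list, cost, params)[:2]
for inputs in batches:
    batch_cost = f_grad_shared(*inputs)  # forward pass + gradient statistics
    f_update(1e-3)                       # apply the parameter update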
def lcn(x, ishape, size=9):
    # Function borrowed from bengioe_util
    """
    Expects x to be a tensor{3|4}: the first dimension is the number of
    images, and the last two are the shape of the image (which should be
    given anyway, for optimization purposes).
    """
    inshape = (x.shape[0], 1, ishape[0], ishape[1])
    p = x.reshape(inshape)
    # p = (p - TT.mean(p)) / T.std(p)
    g = gaussian(size, 1.591 / size)
    g /= g.sum()
    g = numpy.float32(g.reshape((1, 1, size, size)))
    mean = TT.nnet.conv.conv2d(p, TT.constant(g),
                               None,
                               (1, 1, size, size),
                               'full').reshape(
                                   (x.shape[0], 1) + (ishape[0] + size - 1,) * 2)
    mean = mean[:, :,
                size / 2:ishape[0] + size / 2,
                size / 2:ishape[1] + size / 2]
    v = (p - mean)  # .dimshuffle('x','x',0,1)
    var = TT.nnet.conv.conv2d(TT.sqr(v), TT.constant(g),
                              None,
                              (1, 1, size, size),
                              'full').reshape(
                                  (x.shape[0], 1) + (ishape[0] + size - 1,) * 2)
    var = var[:, :,
              size / 2:ishape[0] + size / 2,
              size / 2:ishape[1] + size / 2]
    std = TT.sqrt(var)
    std_mean = TT.mean(TT.mean(std, axis=3), axis=2).dimshuffle(0, 1, 'x', 'x')
    out = v / TT.maximum(std, std_mean)
    return (out + 2.5) / 5  # - out.min()
def __init__(self, img_height, img_width, obj_type='circle', obj_scale=0.2):
    """
    A class for drawing a few simple objects with subpixel resolution.
    """
    self.img_height = img_height
    self.img_width = img_width
    self.obj_type = obj_type
    self.obj_scale = obj_scale
    # make coordinate system for points in the object to render
    obj_x_coords, obj_y_coords = self._construct_obj_coords(
        obj_type=self.obj_type, obj_scale=self.obj_scale)
    self.obj_x = T.constant(obj_x_coords)
    self.obj_y = T.constant(obj_y_coords)
    self.obj_x_range = [np.min(obj_x_coords), np.max(obj_x_coords)]
    self.obj_y_range = [np.min(obj_y_coords), np.max(obj_y_coords)]
    # make coordinate system for x and y location in the image.
    # -- image coordinates for the smallest dimension range over
    #    [-init_scale....init_scale], and coordinates for the largest
    #    dimension are at the same scale, but over a larger range.
    img_x_coords, img_y_coords = self._construct_img_coords(
        x_dim=self.img_width, y_dim=self.img_height)
    self.img_x = T.constant(img_x_coords)
    self.img_y = T.constant(img_y_coords)
    self.img_x_range = [np.min(img_x_coords), np.max(img_x_coords)]
    self.img_y_range = [np.min(img_y_coords), np.max(img_y_coords)]
    return
def _init_params_(self, kbm, kbm_mask, emb, word_size=100, hidden_size=400,
                  prefix='KBMN_'):
    # L2-normalize the embedding matrix
    emb_ = np.sqrt(np.sum(emb ** 2, axis=1))
    emb = emb / np.dot(emb_.reshape(-1, 1), np.ones((1, emb.shape[1])))
    emb[0, :] = 0.
    self.emb = theano.shared(
        value=np.asarray(emb, dtype=theano.config.floatX),
        name=prefix + 'emb',
        borrow=True
    )
    self.kbm = T.constant(
        x=kbm,
        name=prefix + 'kbm',
        ndim=2,
        dtype='int32'
    )
    self.kbm_mask = T.constant(
        x=kbm_mask,
        name=prefix + 'kbm_mask',
        ndim=2,
        dtype=theano.config.floatX
    )

    def _random_weights(x_dim, y_dim):
        return np.random.uniform(
            low=-np.sqrt(6. / (x_dim + y_dim)),
            high=np.sqrt(6. / (x_dim + y_dim)),
            size=(x_dim, y_dim)
        ).astype(theano.config.floatX)

    self.gru_W = theano.shared(
        value=np.concatenate(
            [_random_weights(word_size, hidden_size),
             _random_weights(word_size, hidden_size),
             _random_weights(word_size, hidden_size)],
            axis=1
        ).astype(theano.config.floatX),
        name=prefix + 'gru_W',
        borrow=True
    )
    self.gru_U = theano.shared(
        value=np.concatenate(
            [_random_weights(hidden_size, hidden_size),
             _random_weights(hidden_size, hidden_size),
             _random_weights(hidden_size, hidden_size)],
            axis=1
        ).astype(theano.config.floatX),
        name=prefix + 'gru_U',
        borrow=True
    )
    self.gru_B = theano.shared(
        value=np.zeros((3 * hidden_size,)).astype(theano.config.floatX),
        name=prefix + 'b',
        borrow=True
    )
def test_constant(self):
    ## Re-init counter
    Variable.__count__ = count(0)
    r1 = tensor.constant(1.5)
    r2 = tensor.constant(1.5)
    assert r1.auto_name == "auto_0"
    assert r2.auto_name == "auto_1"
def test_mixture_api():
    # Check basic API
    p1 = Normal(mu=0.0, sigma=T.constant(1.0))
    p2 = Normal(mu=1.0, sigma=2.0)
    m = Mixture(components=[p1, p2], weights=[0.25])
    assert len(m.components) == 2
    assert len(m.weights) == 2

    assert len(m.parameters_) == 4
    assert len(m.constants_) == 1
    assert len(m.observeds_) == 0

    assert p1.mu in m.parameters_
    assert p1.sigma in m.constants_
    assert p2.mu in m.parameters_
    assert p2.sigma in m.parameters_

    assert m.X == p1.X
    assert m.X == p2.X
    assert m.ndim == p1.ndim
    assert m.ndim == p2.ndim

    m = Mixture(components=[p1, p2])
    w = m.compute_weights()
    assert_array_equal(w, [0.5, 0.5])

    y = T.dscalar(name="y")
    w1 = T.constant(0.25)
    w2 = y * 2
    m = Mixture(components=[p1, p2], weights=[w1, w2])
    assert y in m.observeds_

    # Check errors
    assert_raises(ValueError, Mixture,
                  components=[p1, p1, p1], weights=[1.0])
def test_transform_thin_plate_spline_variable_input(self):
    import lasagne
    from lasagne.utils import floatX
    from theano.tensor import constant

    x = np.random.random((10, 3, 28, 28)).astype('float32')
    x_sym = theano.tensor.tensor4()

    l_in = lasagne.layers.InputLayer((None, 3, None, 28))
    l_loc = lasagne.layers.DenseLayer(
        lasagne.layers.ReshapeLayer(l_in, ([0], 3 * 28 * 28)),
        num_units=32)
    l_trans = lasagne.layers.TPSTransformerLayer(
        l_in, l_loc, precompute_grid='auto')

    # check that shape propagation works
    assert l_trans.output_shape[0] is None
    assert l_trans.output_shape[1] == 3
    assert l_trans.output_shape[2] is None
    assert l_trans.output_shape[3] == 28

    # check that data propagation works
    dest_offset = np.zeros(shape=(10, 32))
    inputs = floatX(np.arange(np.prod(x.shape)).reshape(x.shape))
    outputs = l_trans.get_output_for([constant(inputs),
                                      constant(dest_offset)]).eval()
    np.testing.assert_allclose(inputs, outputs, atol=5e-4)
def softmax(self, D, I):
    D = D * T.constant(self.attrs['sharpening'], 'float32')
    if self.attrs['norm'] == 'exp':
        E = T.exp(-D) * I
        E = E / T.maximum(T.sum(E, axis=0, keepdims=True),
                          T.constant(1e-20, 'float32'))
    elif self.attrs['norm'] == 'sigmoid':
        E = (numpy.float32(1) - T.tanh(D) ** 2) * I
    elif self.attrs['norm'] == 'lstm':
        n_out = self.attrs['template']

        def lstm(z, i_t, s_p, h_p):
            z += T.dot(h_p, self.N_re)
            i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
            ingate = T.nnet.sigmoid(z[:, n_out:2 * n_out])
            forgetgate = T.nnet.sigmoid(z[:, 2 * n_out:3 * n_out])
            outgate = T.nnet.sigmoid(z[:, 3 * n_out:])
            input = T.tanh(z[:, :n_out])
            s_t = input * ingate + s_p * forgetgate
            h_t = T.tanh(s_t) * outgate
            return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i

        E, _ = theano.scan(lstm, sequences=[D, I],
                           outputs_info=[T.zeros((n_out,), 'float32'),
                                         T.zeros((n_out,), 'int32')])
        E = T.nnet.sigmoid(T.dot(E, self.N_out))
    else:
        raise NotImplementedError()
    if self.attrs['nbest'] > 1:
        opt = T.minimum(self.attrs['nbest'], E.shape[0])
        score = (T.sort(E, axis=0)[-opt]).dimshuffle('x', 0).repeat(E.shape[0], axis=0)
        E = T.switch(T.lt(E, score), T.zeros_like(E), E)
    return E
def _allocate(self):
    input_dim = ((self.input_dim,)
                 if not isinstance(self.input_dim, collections.Sequence)
                 else self.input_dim)
    broadcastable = (tuple(False for _ in input_dim)
                     if self.broadcastable is None else self.broadcastable)
    if len(input_dim) != len(broadcastable):
        raise ValueError("input_dim and broadcastable must be same length")
    var_dim = tuple(1 if broadcast else dim for dim, broadcast in
                    equizip(input_dim, broadcastable))

    # "beta", from the Ioffe & Szegedy manuscript.
    if self.learn_shift:
        self.shift = shared_floatx_nans(var_dim, name='batch_norm_shift',
                                        broadcastable=broadcastable)
        add_role(self.shift, BATCH_NORM_SHIFT_PARAMETER)
        self.parameters.append(self.shift)
    else:
        self.shift = tensor.constant(0, dtype=theano.config.floatX)

    if self.learn_scale and not self.mean_only:
        # "gamma", from the Ioffe & Szegedy manuscript.
        self.scale = shared_floatx_nans(var_dim, name='batch_norm_scale',
                                        broadcastable=broadcastable)
        add_role(self.scale, BATCH_NORM_SCALE_PARAMETER)
        self.parameters.append(self.scale)
    else:
        self.scale = tensor.constant(1., dtype=theano.config.floatX)

    self._allocate_population_statistics(var_dim, broadcastable)
def test_transform_thin_plate_spline_shift(self):
    from lasagne.layers import InputLayer, TPSTransformerLayer
    from theano.tensor import constant

    batchsize = 5
    num_control_points = 16
    dest_offset = np.ones(shape=(batchsize, 2 * num_control_points))
    l_in = InputLayer((batchsize, 3, 28, 28))
    l_loc = InputLayer((batchsize, 2 * num_control_points))
    layer = TPSTransformerLayer(
        l_in, l_loc, control_points=num_control_points
    )

    image = np.zeros(shape=(28, 28))
    image[[0, -1], :] = 1
    image[:, [0, -1]] = 1
    inputs = np.tile(image, (batchsize, 3, 1, 1))

    shifted_input = np.ones(shape=(28, 28))
    shifted_input[:13, :13] = 0
    shifted_input[13, :13] = 0.50000271
    shifted_input[:13, 13] = 0.50000271
    shifted_input[13, 13] = 0.75000271
    shifted_input = np.tile(shifted_input, (batchsize, 3, 1, 1))

    outputs = layer.get_output_for([constant(inputs),
                                    constant(dest_offset)]).eval()
    np.testing.assert_allclose(shifted_input, outputs, atol=1e-5)
def __init__(self, name, data, distribution, model):
    """
    Parameters
    ----------
    name : str
    data : array_like or pandas object
        observed data; pandas objects are converted via their `values`
        attribute
    distribution : Distribution
    model : Model
    """
    self.name = name
    data = getattr(data, 'values', data)  # handle pandas
    args = as_iterargs(data)

    if len(args) > 1:
        params = getargspec(distribution.logp).args
        args = [t.constant(d, name=name + "_" + param)
                for d, param in zip(args, params)]
    else:
        args = [t.constant(args[0], name=name)]

    self.logp_elemwiset = distribution.logp(*args)
    self.model = model
def test_binary_hinge_loss():
    x = np.array([[-1.5, -1, -0.5, 0, 0.5, 1, 1.5]] * 2, dtype=fX)
    y = np.array([[0] * 7, [1] * 7], dtype=fX)
    res = treeano.utils.binary_hinge_loss(T.constant(x), T.constant(y)).eval()
    ans = np.array([[0, 0, 0.5, 1, 1.5, 2, 2.5],
                    [2.5, 2, 1.5, 1, 0.5, 0, 0]],
                   dtype=fX)
    np.testing.assert_equal(res, ans)
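# Minimal NumPy reference for the expected values in the test above, assuming
# the convention that {0, 1} targets are mapped to signs {-1, +1}:
#     hinge(x, y) = max(0, 1 - (2 * y - 1) * x)
import numpy as np

def binary_hinge_loss_ref(x, y):
    return np.maximum(0, 1 - (2 * y - 1) * x)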
def test_draw_value():
    npt.assert_equal(_draw_value(np.array([5, 6])), [5, 6])
    npt.assert_equal(_draw_value(np.array(5.)), 5)

    npt.assert_equal(_draw_value(tt.constant([5., 6.])), [5, 6])
    assert _draw_value(tt.constant(5)) == 5
    npt.assert_equal(_draw_value(2 * tt.constant([5., 6.])), [10, 12])

    val = theano.shared(np.array([5., 6.]))
    npt.assert_equal(_draw_value(val), [5, 6])
    npt.assert_equal(_draw_value(2 * val), [10, 12])

    a = tt.scalar('a')
    a.tag.test_value = 6
    npt.assert_equal(_draw_value(2 * a, givens=[(a, 1)]), 2)

    assert _draw_value(5) == 5
    assert _draw_value(5.) == 5
    assert isinstance(_draw_value(5.), type(5.))
    assert isinstance(_draw_value(5), type(5))

    with pm.Model():
        mu = 2 * tt.constant(np.array([5., 6.])) + theano.shared(np.array(5))
        a = pm.Normal('a', mu=mu, sd=5, shape=2)
        val1 = _draw_value(a)
        val2 = _draw_value(a)
        assert np.all(val1 != val2)

    with pytest.raises(ValueError) as err:
        _draw_value([])
    err.match('Unexpected type')
def add_param(self, param, name="", constraints=True,
              custom_update=None, custom_update_normalized=False,
              custom_update_exp_average=0,
              custom_update_condition=None,
              custom_update_accumulate_batches=None):
    """
    :type param: theano.SharedVariable
    :type name: str
    :rtype: theano.SharedVariable
    """
    param = super(Layer, self).add_param(param, name)
    if custom_update:
        # Handled in Device and Updater.
        param.custom_update = custom_update
        param.custom_update_normalized = custom_update_normalized
        param.custom_update_exp_average = custom_update_exp_average
        param.custom_update_condition = custom_update_condition
        param.custom_update_accumulate_batches = custom_update_accumulate_batches
    if constraints:
        if 'L1' in self.attrs and self.attrs['L1'] > 0:
            self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
        if 'L2' in self.attrs and self.attrs['L2'] > 0:
            self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param ** 2).sum()
        if self.attrs.get('L2_eye', 0) > 0:
            L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
            if param.ndim == 2:
                eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
                self.constraints += L2_eye * ((param - eye) ** 2).sum()
            else:  # standard L2
                self.constraints += L2_eye * (param ** 2).sum()
        if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
            self.constraints += self.attrs['varreg'] * \
                (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape)) ** 2
    return param
def get_updates(self, cost, learning_rate, momentum):
    if not self.params:
        self.learning_rate = T.constant(0)
        return {}

    if self.grads is None:
        self.grads = [theano.shared(np.zeros_like(p.get_value()))
                      for p in self.params]

    # compute the gradients of the cost with respect to the parameters
    gparams = T.grad(cost, self.params, disconnected_inputs='ignore')

    grad_mult = self.conf.geteval('grad_mult', None)
    if grad_mult is not None:
        grad_mult = T.constant(grad_mult, dtype=floatX)
        gparams = [g * grad_mult for g in gparams]

    clip = self.conf.getfloat('grad_clip', None)
    if clip is not None:
        gparams = [T.clip(g, -clip, clip) for g in gparams]
    self.gparams = gparams

    # generate the list of updates
    gupdates = OrderedDict()
    pupdates = OrderedDict()
    self.learning_rate = self.conf.getfloat('learning_rate', None)
    if self.learning_rate:
        self.learning_rate = T.constant(self.learning_rate)
    else:
        self.learning_rate = learning_rate
    for (gparam, param, gold) in zip(gparams, self.params, self.grads):
        lrscale = self.conf.getfloat(
            'learning_rate_scale_%s' % param.name, None)
        if lrscale is None:
            lrscale = self.conf.getfloat('learning_rate_scale', 1.0)
        decay = self.conf.getfloat('weight_decay_%s' % param.name, 0.0)
        lr = self.learning_rate
        if lrscale != 1.0:
            lr *= lrscale
        if decay:
            gparam += decay * param
        if momentum:
            gnew = momentum * gold + gparam
            gupdates[gold] = gnew
            pupdates[param] = param - lr * gnew
        else:
            gupdates[gold] = gparam
            pupdates[param] = param - lr * gparam

    # apply update constraints
    for (p, constraint) in self.constraints.iteritems():
        pupdates[p] = constraint(pupdates[p])
    return OrderedDict(gupdates.items() + pupdates.items())
def test_constant(self):
    # Get counter value
    autoname_id = next(Variable.__count__)
    Variable.__count__ = count(autoname_id)
    r1 = tensor.constant(1.5)
    r2 = tensor.constant(1.5)
    assert r1.auto_name == "auto_" + str(autoname_id)
    assert r2.auto_name == "auto_" + str(autoname_id + 1)
def __init__(self, incoming, means, covariances, weights, patch_size,
             pool_func=T.sum, **kwargs):
    self.means = T.constant(means, dtype=theano.config.floatX)
    self.covariances = T.constant(covariances, dtype=theano.config.floatX)
    self.weights = T.constant(weights, dtype=theano.config.floatX)
    self.patch_size = patch_size
    self.pool_func = pool_func
    super(GaussianMixtureSimilarityLayer, self).__init__(incoming, **kwargs)
def compute_output(self, network, state_vw, sampled_vw):
    W = T.constant(TARGET_WEIGHT)
    b = T.constant(TARGET_BIAS)
    target = T.dot(state_vw.variable, W) + b.dimshuffle("x", 0)
    reward = -T.sqr(sampled_vw.variable - target).sum(axis=1)
    network.create_vw(
        "raw_reward",
        variable=T.mean(reward),
        shape=(),
    )
    baseline_reward = 100
    network.create_vw(
        "default",
        variable=reward + baseline_reward,
        shape=(state_vw.shape[0],),
        tags={"output"},
    )
def test_dtype_normal_uniform_687(self):
    # Regression test for #687.
    rng_R = random_state_type()
    assert uniform(rng_R, low=tensor.constant(0, dtype='float64'),
                   dtype='float32')[1].dtype == 'float32'
    assert normal(rng_R, avg=tensor.constant(0, dtype='float64'),
                  dtype='float32')[1].dtype == 'float32'
def _build_expression(self, input_expression=None):
    if self.pool_type not in ['max', 'avg']:
        raise NotImplementedError(
            'Pooling only implemented for max and avg')
    if input_expression is None:
        self.input_ = T.tensor4(dtype=self.input_dtype)
    else:
        self.input_ = input_expression

    # Replicating caffe style pooling means zero padding
    # then strided pooling with ignore_border=True
    if self.padding in [0, (0, 0)]:
        padded_input = self.input_
    else:
        zero_padder = ZeroPad(padding=self.padding)
        zero_padder._build_expression(self.input_)
        padded_input = zero_padder.expression_

    if self.pool_type == 'max':
        pooled = fancy_max_pool(padded_input,
                                self.pool_shape, self.pool_stride,
                                ignore_border=False)
    elif self.pool_type == 'avg':
        # self.pool_shape needs to be a tuple
        avg_kernel = T.cast(T.ones((1, 1) + self.pool_shape,
                                   dtype=self.input_.dtype
                                   ) / np.prod(self.pool_shape),
                            self.input_.dtype)
        n_imgs = self.input_.shape[0]
        n_channels = self.input_.shape[1]
        conv_output = T.nnet.conv2d(
            padded_input.reshape((n_imgs * n_channels, 1,
                                  padded_input.shape[2],
                                  padded_input.shape[3])),
            avg_kernel, subsample=self.pool_stride)
        pooled = conv_output.reshape((n_imgs, n_channels,
                                      conv_output.shape[2],
                                      conv_output.shape[3]))

    # A caffe quirk: The output shape is (for width, analogous for h:)
    # ceil((w + 2 * pad_w - kernel_w) / stride_w) + 1, instead of floor
    # With floor, ignore_border=True would have yielded the exact result
    # With ceil, sometimes we need an extra column and/or line. So we do
    # ignore_border=False and then crop to the right shape. Since the
    # shape is dynamic we need to first calculate it:

    # padding gotta be a tuple too
    pad = T.constant(self.padding)
    # pad = T.constant(zero_padder.padding_)
    # supposing here that self.pool_shape is a tuple. Should check
    pool_shape = T.constant(self.pool_shape)
    # stride hopefully a tuple, too
    pool_stride = T.constant(self.pool_stride, dtype='float64')
    float_shape = (self.input_.shape[2:4] + 2 * pad
                   - pool_shape) / pool_stride + 1
    output_shape = T.cast(T.ceil(float_shape), dtype='int64')
    self.expression_ = pooled[:, :, 0:output_shape[0],
                              0:output_shape[1]]
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Runs a basic sanity check on the `uniform` method of a
    `CURAND_RandomStreams` object.

    Checks that variates

     * are in the range [0, 1]
     * have a mean in the right neighbourhood (near 0.5)
     * are of the specified shape
     * successive calls produce different arrays of variates

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape tuple is a
        Theano symbolic. Irrelevant if `shape_as_symbolic` is `True`.
    """
    rng = CURAND_RandomStreams(234)
    if shape_as_symbolic:
        # instantiate a TensorConstant with the value (10, 10)
        shape = constant((10, 10))
    else:
        # Only one dimension is symbolic, with the others known
        if dim_as_symbolic:
            shape = (10, constant(10))
        else:
            shape = (10, 10)
    u0 = rng.uniform(shape)
    u1 = rng.uniform(shape)
    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)
    v0list = [f0() for i in range(3)]
    v1list = [f1() for i in range(3)]
    # print v0list
    # print v1list

    # assert that elements are different in a few ways
    assert numpy.all(v0list[0] != v0list[1])
    assert numpy.all(v1list[0] != v1list[1])
    assert numpy.all(v0list[0] != v1list[0])

    for v in v0list:
        assert v.shape == (10, 10)
        assert v.min() >= 0
        assert v.max() <= 1
        assert v.min() < v.max()
        assert .25 <= v.mean() <= .75
def dtw(i, q_p, b_p, Q, D, inf):
    i0 = T.eq(i, 0)
    # inf = T.cast(1e10,'float32') * T.cast(T.switch(T.eq(self.n,0), T.switch(T.eq(i,0), 0, 1), 1), 'float32')
    penalty = T.switch(T.and_(T.neg(n0), i0), big, T.constant(0.0, 'float32'))
    loop = T.constant(0.0, 'float32') + q_p
    forward = T.constant(0.0, 'float32') + T.switch(T.or_(n0, i0), 0, Q[i - 1])
    opt = T.stack([loop, forward])
    k_out = T.cast(T.argmin(opt, axis=0), 'int32')
    return opt[k_out, T.arange(opt.shape[1])] + D[i] + penalty, k_out
def test_multiclass_hinge_loss():
    x = np.array([[0, 1],
                  [1, 0],
                  [0.5, 1.5],
                  [0, 0.5]] * 2,
                 dtype=fX)
    y = np.array([0] * 4 + [1] * 4, dtype="int32")
    res = treeano.utils.multiclass_hinge_loss(T.constant(x),
                                              T.constant(y)).eval()
    ans = np.array([[1, 2],
                    [1, 0],
                    [1, 2],
                    [1, 1.5],
                    [0, 1],
                    [2, 1],
                    [0, 1],
                    [0.5, 1]],
                   dtype=fX)
    np.testing.assert_equal(res, ans)
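# Minimal NumPy reference for the expected values in the test above, assuming
# a per-class hinge of the form max(0, 1 + x_j - x_y) with the correct class
# included (which is why every row has a 1 in its target column):
import numpy as np

def multiclass_hinge_loss_ref(x, y):
    correct = x[np.arange(len(y)), y][:, np.newaxis]
    return np.maximum(0, 1 + x - correct)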
def _run(self, num_features, num_timesteps, batch_size, mode):
    # determine shapes of inputs and targets depending on the batch size
    if batch_size == 1:
        inputs_size = (num_timesteps, num_features)
        targets_size = (num_timesteps, 1)
    else:
        inputs_size = (num_timesteps, batch_size, num_features)
        targets_size = (num_timesteps, batch_size, 1)

    # make inputs and targets shared variables
    inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
        config.floatX), borrow=True)
    targets = theano.shared(self.rng.uniform(size=targets_size).astype(
        config.floatX), borrow=True)

    # create symbolic inputs and targets variables
    if batch_size == 1:
        x = T.matrix('inputs')
        t = T.matrix('targets')
    else:
        x = T.tensor3('inputs')
        t = T.tensor3('targets')
    x.tag.test_value = inputs.get_value(borrow=True)
    t.tag.test_value = targets.get_value(borrow=True)

    # create a set of parameters for a simple RNN
    W_xh = theano.shared(
        (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
            config.floatX), borrow=True)
    W_hh = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
        borrow=True)
    W_hy = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
        borrow=True)
    b_h = theano.shared(numpy.zeros(10).astype(config.floatX), borrow=True)
    b_y = theano.shared(numpy.zeros(1).astype(config.floatX), borrow=True)

    params = [W_xh, W_hh, W_hy, b_h, b_y]

    # recurrent function
    def step(x_t, h_tm1):
        h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
        return h

    # build recurrent graph
    if batch_size == 1:
        h_0 = T.alloc(0.0, 10).astype(config.floatX)
    else:
        h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
    h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])
    # network output
    y = T.dot(h, W_hy) + b_y

    # Create Gauss-Newton-Matrix object. Not really of any use here, but I
    # need it for Hessian-Free optimization.
    gn = GaussNewtonMatrix(y)

    # compute MSE
    cost = ((t - y) ** 2).sum(axis=1).mean()

    # Compute the cost at some other point in the parameter
    # space. Not really of any use here, but this is how I do it
    # during certain iterations of CG in the HF algorithm. There,
    # it's in fact `pi + current update proposal`. For simplicity,
    # I just multiply by 2 here.
    cost_ = theano.clone(cost,
                         replace=dict([(pi, 2 * pi) for pi in params]))

    # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
    # but for simplicity, I just take the parameters vector because it's
    # already there.
    Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

    # compile Theano function
    f = theano.function([], [cost_] + Gv,
                        givens={x: inputs, t: targets},
                        mode=mode)
    # execute
    f()
def const(value):
    return TT.constant(numpy.asarray(value, dtype=theano.config.floatX))
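# Tiny usage example: wrap Python scalars or lists as floatX constants so
# they combine with other Theano expressions without dtype surprises.
half = const(0.5)
scaled = const([1, 2, 3]) * half  # symbolic constant vector [0.5, 1.0, 1.5]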
def __init__(self, key_index, label_num, pretrain_name=None, encoder='lstm', word_dim=300, hidden='100_100', dropout=0.5, regularization_weight=0.0001, optimizer_name='adagrad', lr=0.1, norm_lim=-1, label2index_filename=None): self.label2index, self.index2label = self.load_label_index( label2index_filename, label_num) self.indexs = T.imatrix() # (batch, max_len) self.golden = T.ivector() # (batch, ) self.max_len = T.iscalar() # max length self.s1_mask = self.indexs[:, :self.max_len] > 0 self.s1_mask = self.s1_mask * T.constant(1.0, dtype=theano.config.floatX) if pretrain_name is None: self.embedding = WordEmbedding( key_index, dim=word_dim, initializer=UniformInitializer(scale=0.01)) else: self.embedding = WordEmbedding(key_index, filename=pretrain_name, normalize=False, binary=True) assert self.embedding.dim == word_dim self.word_embeddings = self.embedding[self.indexs[:, :self.max_len]] if type(hidden) is str: hidden_dims = [int(hid) for hid in hidden.split('_')] else: hidden_dims = [hidden] if encoder == 'lstm': encoder_layer = LSTMEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="LSTM_", dropout=dropout) elif encoder == 'bilstm': encoder_layer = BiLSTMEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="BiLSTM_", bidirection_shared=True, dropout=dropout) elif encoder == 'recurrent': encoder_layer = RecurrentEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="Recurrent_", dropout=dropout) elif encoder == 'birecurrent': encoder_layer = BiRecurrentEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="BiRecurrent_", bidirection_shared=True, dropout=dropout) elif encoder == 'gru': encoder_layer = GRUEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="GRU_", dropout=dropout) elif encoder == 'bigru': encoder_layer = BiGRUEncoder(in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='final', prefix="BiGRU_", bidirection_shared=True, dropout=dropout) elif encoder == 'cbow': encoder_layer = CBOWLayer(in_dim=word_dim, ) elif encoder == 'cnn': encoder_layer = MultiFilterConvolutionLayer( in_dim=word_dim, hidden_dim=hidden_dims[0], pooling='max', prefix="ConvLayer_", kernel_sizes=CONV_FILTER_SIZES) else: raise NotImplementedError self.text_embedding = encoder_layer.forward_batch( self.word_embeddings, self.s1_mask) if len(hidden_dims) > 1: hidden_layer = MultiHiddenLayer(in_dim=encoder_layer.out_dim, hidden_dims=hidden_dims[1:], dropout=dropout, prefix='Full_Connected_Layer_') classifier_input = hidden_layer.forward_batch(self.text_embedding) classifier_input_dim = hidden_layer.out_dim else: classifier_input = self.text_embedding classifier_input_dim = encoder_layer.out_dim self.classifier = SoftmaxClassifier(classifier_input_dim, label_num, dropout=dropout) self.predict_loss = self.classifier.loss(classifier_input, self.golden) self.predict_prob = self.classifier.forward_batch(classifier_input) self.predict_label = T.argmax(self.predict_prob, axis=1) """Params in TextClassifier""" self.params = self.classifier.params + encoder_layer.params self.l2_norm = self.classifier.l2_norm + encoder_layer.l2_norm if len(hidden_dims) > 1: self.params += hidden_layer.params self.l2_norm += hidden_layer.l2_norm self.l2_loss = regularization_weight * self.l2_norm / 2 self.loss = self.predict_loss + self.l2_loss """Opimizer and Loss""" if optimizer_name == 'adagrad': sgd_optimizer = AdaGradOptimizer(lr=lr, norm_lim=norm_lim) elif optimizer_name == 'adadelta': sgd_optimizer = 
AdaDeltaOptimizer(lr=lr, norm_lim=norm_lim) elif optimizer_name == 'sgd': sgd_optimizer = SGDOptimizer(lr=lr, norm_lim=norm_lim) elif optimizer_name == 'momentum': sgd_optimizer = SGDMomentumOptimizer(lr=lr, norm_lim=norm_lim) elif optimizer_name == 'adam': sgd_optimizer = AdamOptimizer(lr=lr, norm_lim=norm_lim) else: raise NotImplementedError self.train_indexs = T.ivector() self.train_data_x = shared_zero_matrix(shape=(5, 5), name="train_data_x", dtype=np.int32) self.train_data_y = shared_zero_matrix(shape=(5, ), name="train_data_y", dtype=np.int32) self.model_params = self.params + self.embedding.params """Theano Function""" if EMBEDDING_LR > 0: embedding_updates = SGDOptimizer(lr=EMBEDDING_LR, norm_lim=-1).get_update( self.loss, self.embedding.params) updates = sgd_optimizer.get_update( self.loss, self.params, norm_exc_params=self.embedding.params) updates.update(embedding_updates) elif EMBEDDING_LR < 0: # Optimize Embedding using Global Optimizer self.params += self.embedding.params updates = sgd_optimizer.get_update( self.loss, self.params, norm_exc_params=self.embedding.params) else: # Fix Embedding updates = sgd_optimizer.get_update( self.loss, self.params, norm_exc_params=self.embedding.params) self.train_batch = theano.function( inputs=[self.train_indexs, self.max_len], outputs=[self.loss, self.predict_loss, self.l2_loss], updates=updates, givens=[(self.indexs, self.train_data_x[self.train_indexs]), (self.golden, self.train_data_y[self.train_indexs])]) self.loss_batch = theano.function( inputs=[self.indexs, self.golden, self.max_len], outputs=[self.loss, self.predict_loss, self.l2_loss], ) self.pred_prob_batch = theano.function( inputs=[self.indexs, self.max_len], outputs=[self.predict_prob], ) self.pred_label_batch = theano.function( inputs=[self.indexs, self.max_len], outputs=[self.predict_label], ) self.get_l2_loss = theano.function( inputs=[], outputs=[self.l2_loss, self.l2_norm], )
def compile_theano_func_build_G_mtx(): tau_inter_x, tau_inter_y = TT.scalar('tau_inter_x'), TT.scalar( 'tau_inter_y') M, N = TT.scalar('M'), TT.scalar('N') m_grid, n_grid = TT.vector('m_grid'), TT.vector('n_grid') cross_beamShape_r, cross_beamShape_i = \ TT.tensor3('cross_beamShape_r'), TT.tensor3('cross_beamShape_i') baseline_x, baseline_y = TT.tensor3('baseline_x'), TT.tensor3('baseline_y') pi = TT.constant(np.pi) def theano_periodic_sinc(in_sig, bandwidth): eps = TT.constant(1e-10) denominator = TT.mul(TT.sin(TT.true_div(in_sig, bandwidth)), bandwidth) idx_modi = TT.lt(TT.abs_(denominator), eps) numerator = TT.switch(idx_modi, TT.cos(in_sig), TT.sin(in_sig)) denominator = TT.switch(idx_modi, TT.cos(TT.true_div(in_sig, bandwidth)), denominator) return TT.true_div(numerator, denominator) # def theano_periodic_sinc(in_sig, bandwidth): # eps = TT.constant(1e-10) # numerator = TT.sin(in_sig) # denominator = TT.mul(TT.sin(TT.true_div(in_sig, bandwidth)), bandwidth) # out0 = TT.true_div(numerator, denominator) # out1 = TT.true_div(TT.cos(in_sig), TT.cos(TT.true_div(in_sig, bandwidth))) # idx_modi = TT.lt(TT.abs_(denominator), eps) # out = TT.switch(idx_modi, out1, out0) # return out # define the function def f_inner(cross_beamShape_r, cross_beamShape_i, baseline_x, baseline_y, tau_inter_x, tau_inter_y, m_grid, n_grid, M, N): periodic_sinc_2d = \ TT.mul( theano_periodic_sinc( 0.5 * (TT.shape_padright(tau_inter_x * baseline_x, n_ones=1) - 2 * pi * TT.shape_padleft(m_grid, n_ones=2)), M * tau_inter_x ), theano_periodic_sinc( 0.5 * (TT.shape_padright(tau_inter_y * baseline_y, n_ones=1) - 2 * pi * TT.shape_padleft(n_grid, n_ones=2)), N * tau_inter_y ) ) G_mtx_r = TT.tensordot(cross_beamShape_r, periodic_sinc_2d, axes=[[0, 1], [0, 1]]) G_mtx_i = TT.tensordot(cross_beamShape_i, periodic_sinc_2d, axes=[[0, 1], [0, 1]]) return G_mtx_r, G_mtx_i G_mtx_r, G_mtx_i = theano.map(fn=f_inner, sequences=(cross_beamShape_r, cross_beamShape_i, baseline_x, baseline_y), non_sequences=(tau_inter_x, tau_inter_y, m_grid, n_grid, M, N))[0] # compile the function func = theano.function([ tau_inter_x, tau_inter_y, M, N, m_grid, n_grid, baseline_x, baseline_y, cross_beamShape_r, cross_beamShape_i ], [G_mtx_r, G_mtx_i], allow_input_downcast=True) return func
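# Plain NumPy reference (an assumption-labelled sketch, not part of the
# compiled graph above) for the periodic sinc used in theano_periodic_sinc:
# sin(x) / (B * sin(x / B)), with the L'Hopital fallback cos(x) / cos(x / B)
# wherever the denominator vanishes.
import numpy as np

def periodic_sinc_ref(x, bandwidth, eps=1e-10):
    denom = bandwidth * np.sin(x / bandwidth)
    near_zero = np.abs(denom) < eps
    safe_denom = np.where(near_zero, 1.0, denom)
    return np.where(near_zero,
                    np.cos(x) / np.cos(x / bandwidth),
                    np.sin(x) / safe_denom)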
def run(only_forward=False): logger = afs_safe_logger.Logger( os.path.join(FLAGS.log_path, FLAGS.experiment_name) + ".log") if FLAGS.data_type == "bl": data_manager = load_boolean_data elif FLAGS.data_type == "sst": data_manager = load_sst_data elif FLAGS.data_type == "snli": data_manager = load_snli_data else: logger.Log("Bad data type.") return pp = pprint.PrettyPrinter(indent=4) logger.Log("Flag values:\n" + pp.pformat(FLAGS.FlagValuesDict())) # Load the data. raw_training_data, vocabulary = data_manager.load_data( FLAGS.training_data_path) # Load the eval data. raw_eval_sets = [] if FLAGS.eval_data_path: for eval_filename in FLAGS.eval_data_path.split(":"): eval_data, _ = data_manager.load_data(eval_filename) raw_eval_sets.append((eval_filename, eval_data)) # Prepare the vocabulary. if not vocabulary: logger.Log( "In open vocabulary mode. Using loaded embeddings without fine-tuning." ) train_embeddings = False vocabulary = util.BuildVocabulary( raw_training_data, raw_eval_sets, FLAGS.embedding_data_path, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA) else: logger.Log("In fixed vocabulary mode. Training embeddings.") train_embeddings = True # Load pretrained embeddings. if FLAGS.embedding_data_path: logger.Log("Loading vocabulary with " + str(len(vocabulary)) + " words from " + FLAGS.embedding_data_path) initial_embeddings = util.LoadEmbeddingsFromASCII( vocabulary, FLAGS.word_embedding_dim, FLAGS.embedding_data_path) else: initial_embeddings = None # Trim dataset, convert token sequences to integer sequences, crop, and # pad. logger.Log("Preprocessing training data.") training_data = util.PreprocessDataset( raw_training_data, vocabulary, FLAGS.seq_length, data_manager, eval_mode=False, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA, for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW") training_data_iter = util.MakeTrainingIterator(training_data, FLAGS.batch_size) eval_iterators = [] for filename, raw_eval_set in raw_eval_sets: logger.Log("Preprocessing eval data: " + filename) e_X, e_transitions, e_y, e_num_transitions = util.PreprocessDataset( raw_eval_set, vocabulary, FLAGS.seq_length, data_manager, eval_mode=True, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA, for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW") eval_iterators.append( (filename, util.MakeEvalIterator( (e_X, e_transitions, e_y, e_num_transitions), FLAGS.batch_size))) # Set up the placeholders. 
y = T.vector("y", dtype="int32") lr = T.scalar("lr") training_mode = T.scalar( "training_mode") # 1: Training with dropout, 0: Eval ground_truth_transitions_visible = T.scalar( "ground_truth_transitions_visible", dtype="int32") logger.Log("Building model.") vs = util.VariableStore(default_initializer=util.UniformInitializer( FLAGS.init_range), logger=logger) if FLAGS.model_type == "CBOW": model_cls = spinn.cbow.CBOW elif FLAGS.model_type == "RNN": model_cls = spinn.plain_rnn.RNN else: model_cls = getattr(spinn.fat_stack, FLAGS.model_type) # Generator of mask for scheduled sampling numpy_random = np.random.RandomState(1234) ss_mask_gen = T.shared_randomstreams.RandomStreams( numpy_random.randint(999999)) # Training step number ss_prob = T.scalar("ss_prob") if data_manager.SENTENCE_PAIR_DATA: X = T.itensor3("X") transitions = T.itensor3("transitions") num_transitions = T.imatrix("num_transitions") predicted_premise_transitions, predicted_hypothesis_transitions, logits = build_sentence_pair_model( model_cls, len(vocabulary), FLAGS.seq_length, X, transitions, len(data_manager.LABEL_MAP), training_mode, ground_truth_transitions_visible, vs, initial_embeddings=initial_embeddings, project_embeddings=(not train_embeddings), ss_mask_gen=ss_mask_gen, ss_prob=ss_prob) else: X = T.matrix("X", dtype="int32") transitions = T.imatrix("transitions") num_transitions = T.vector("num_transitions", dtype="int32") predicted_transitions, logits = build_sentence_model( model_cls, len(vocabulary), FLAGS.seq_length, X, transitions, len(data_manager.LABEL_MAP), training_mode, ground_truth_transitions_visible, vs, initial_embeddings=initial_embeddings, project_embeddings=(not train_embeddings), ss_mask_gen=ss_mask_gen, ss_prob=ss_prob) xent_cost, acc = build_cost(logits, y) # Set up L2 regularization. l2_cost = 0.0 for var in vs.trainable_vars: l2_cost += FLAGS.l2_lambda * T.sum(T.sqr(vs.vars[var])) # Compute cross-entropy cost on action predictions. if (not data_manager.SENTENCE_PAIR_DATA) and FLAGS.model_type not in [ "Model0", "RNN", "CBOW" ]: transition_cost, action_acc = build_transition_cost( predicted_transitions, transitions, num_transitions) elif data_manager.SENTENCE_PAIR_DATA and FLAGS.model_type not in [ "Model0", "RNN", "CBOW" ]: p_transition_cost, p_action_acc = build_transition_cost( predicted_premise_transitions, transitions[:, :, 0], num_transitions[:, 0]) h_transition_cost, h_action_acc = build_transition_cost( predicted_hypothesis_transitions, transitions[:, :, 1], num_transitions[:, 1]) transition_cost = p_transition_cost + h_transition_cost action_acc = (p_action_acc + h_action_acc ) / 2.0 # TODO(SB): Average over transitions, not words. else: transition_cost = T.constant(0.0) action_acc = T.constant(0.0) transition_cost = transition_cost * FLAGS.transition_cost_scale total_cost = xent_cost + l2_cost + transition_cost if ".ckpt" in FLAGS.ckpt_path: checkpoint_path = FLAGS.ckpt_path else: checkpoint_path = os.path.join(FLAGS.ckpt_path, FLAGS.experiment_name + ".ckpt") if os.path.isfile(checkpoint_path): logger.Log("Found checkpoint, restoring.") step, best_dev_error = vs.load_checkpoint( checkpoint_path, num_extra_vars=2, skip_saved_unsavables=FLAGS.skip_saved_unsavables) else: assert not only_forward, "Can't run an eval-only run without a checkpoint. Supply a checkpoint." step = 0 best_dev_error = 1.0 # Do an evaluation-only run. 
if only_forward: if FLAGS.eval_output_paths: eval_output_paths = FLAGS.eval_output_paths.strip().split(":") assert len(eval_output_paths) == len( eval_iterators), "Invalid no. of output paths." else: eval_output_paths = [ FLAGS.experiment_name + "-" + os.path.split(eval_set[0])[1] + "-parse" for eval_set in eval_iterators ] # Load model from checkpoint. logger.Log("Checkpointed model was trained for %d steps." % (step, )) # Generate function for forward pass. logger.Log("Building forward pass.") if data_manager.SENTENCE_PAIR_DATA: eval_fn = theano.function([ X, transitions, y, num_transitions, training_mode, ground_truth_transitions_visible, ss_prob ], [ acc, action_acc, logits, predicted_hypothesis_transitions, predicted_premise_transitions ], on_unused_input='ignore', allow_input_downcast=True) else: eval_fn = theano.function([ X, transitions, y, num_transitions, training_mode, ground_truth_transitions_visible, ss_prob ], [acc, action_acc, logits, predicted_transitions], on_unused_input='ignore', allow_input_downcast=True) # Generate the inverse vocabulary lookup table. ind_to_word = {v: k for k, v in vocabulary.iteritems()} # Do a forward pass and write the output to disk. for eval_set, eval_out_path in zip(eval_iterators, eval_output_paths): logger.Log("Writing eval output for %s." % (eval_set[0], )) evaluate_expanded( eval_fn, eval_set, eval_out_path, logger, step, data_manager.SENTENCE_PAIR_DATA, ind_to_word, FLAGS.model_type not in ["Model0", "RNN", "CBOW"]) else: # Train new_values = util.RMSprop(total_cost, vs.trainable_vars.values(), lr) new_values += [(key, vs.nongradient_updates[key]) for key in vs.nongradient_updates] # Training open-vocabulary embeddings is a questionable idea right now. Disabled: # new_values.append( # util.embedding_SGD(total_cost, embedding_params, embedding_lr)) # Create training and eval functions. # Unused variable warnings are supressed so that num_transitions can be passed in when training Model 0, # which ignores it. This yields more readable code that is very slightly slower. logger.Log("Building update function.") update_fn = theano.function([ X, transitions, y, num_transitions, lr, training_mode, ground_truth_transitions_visible, ss_prob ], [total_cost, xent_cost, transition_cost, action_acc, l2_cost, acc], updates=new_values, on_unused_input='ignore', allow_input_downcast=True) logger.Log("Building eval function.") eval_fn = theano.function([ X, transitions, y, num_transitions, training_mode, ground_truth_transitions_visible, ss_prob ], [acc, action_acc], on_unused_input='ignore', allow_input_downcast=True) logger.Log("Training.") # Main training loop. 
for step in range(step, FLAGS.training_steps):
    if step % FLAGS.eval_interval_steps == 0:
        for index, eval_set in enumerate(eval_iterators):
            acc = evaluate(eval_fn, eval_set, logger, step)
            if FLAGS.ckpt_on_best_dev_error and index == 0 and (
                    1 - acc) < 0.99 * best_dev_error and step > 1000:
                best_dev_error = 1 - acc
                logger.Log(
                    "Checkpointing with new best dev accuracy of %f" % acc)
                vs.save_checkpoint(checkpoint_path + "_best",
                                   extra_vars=[step, best_dev_error])

    X_batch, transitions_batch, y_batch, num_transitions_batch = \
        training_data_iter.next()
    learning_rate = FLAGS.learning_rate * (
        FLAGS.learning_rate_decay_per_10k_steps ** (step / 10000.0))

    ret = update_fn(
        X_batch, transitions_batch, y_batch, num_transitions_batch,
        learning_rate, 1.0, 1.0,
        np.exp(step * np.log(FLAGS.scheduled_sampling_exponent_base)))
    total_cost_val, xent_cost_val, transition_cost_val, action_acc_val, \
        l2_cost_val, acc_val = ret

    if step % FLAGS.statistics_interval_steps == 0:
        logger.Log("Step: %i\tAcc: %f\t%f\tCost: %5f %5f %5f %5f"
                   % (step, acc_val, action_acc_val, total_cost_val,
                      xent_cost_val, transition_cost_val, l2_cost_val))

    if step % FLAGS.ckpt_interval_steps == 0 and step > 0:
        vs.save_checkpoint(checkpoint_path,
                           extra_vars=[step, best_dev_error])
def build_model_core(self): # gradient clipping function self.clipg = lambda x: grad_clip( x, -self.conf['GRAD_CLIP_SIZE'], self.conf['GRAD_CLIP_SIZE']) shared_layers = {} if self.conf['BATCH_NORM']: if not hasattr(self, 'gamma_h'): gamma_h_val = np.ones( (self.conf['lstm_hidden_size'] * 2,), dtype=theano.config.floatX) shared_layers['gamma_h'] = gamma_h_val if not hasattr(self, 'beta_h'): beta_h_val = np.zeros( (self.conf['lstm_hidden_size'] * 2,), dtype=theano.config.floatX) shared_layers['beta_h'] = beta_h_val # set the default network weights if not hasattr(self, 'wemb'): wemb_val = init_layer_k( self.conf['vocab_size'], self.conf['emb_size']) shared_layers['wemb'] = wemb_val if not hasattr(self, 'h0_hidden'): h0_hidden_val = np.zeros( (self.conf['lstm_hidden_size'], ), dtype=theano.config.floatX) shared_layers['h0_hidden'] = h0_hidden_val if not hasattr(self, 'h0_cell'): h0_cell_val = np.zeros( (self.conf['lstm_hidden_size'], ), dtype=theano.config.floatX) shared_layers['h0_cell'] = h0_cell_val # mapping from visual space to word space if not hasattr(self, 'wvm'): wvm_val = init_layer_k( self.conf['visual_size'], self.conf['emb_size']) shared_layers['wvm'] = wvm_val if not hasattr(self, 'bmv'): bmv_val = np.zeros( (self.conf['emb_size'],), dtype=theano.config.floatX) shared_layers['bmv'] = bmv_val # LSTM layer parameters if not hasattr(self, 'w_lstm'): w_lstm_val = init_layer_k( self.conf['lstm_hidden_size']*2, self.conf['lstm_hidden_size']*4) shared_layers['w_lstm'] = w_lstm_val # mapping from RNN hidden output to vocabulary if not hasattr(self, 'w'): w_val = init_layer_k( self.conf['lstm_hidden_size'], self.conf['output_size']) shared_layers['w'] = w_val if not hasattr(self, 'b'): b_val = np.zeros( (self.conf['output_size'],), dtype=theano.config.floatX) if self.conf["INIT_OUTPUT_BIAS"]: # set the bias on the last layer to be the log prob of each of the words in the vocab wcount = 0 w2i = self.dp.w2i w2c = self.dp.get_word_counts(RNNDataProvider.TRAIN) for w in w2i: if w in w2c: wcount += w2c[w] wcount += self.X_train.shape[0] b_val[w2i[RNNDataProvider.STOP_TOKEN]] = np.log( self.X_train.shape[0]/float(wcount)) for w in w2i: if w in w2c: b_val[w2i[w]] = np.log(w2c[w]/float(wcount)) b_val -= np.max(b_val[1:]) shared_layers['b'] = b_val self.build_shared_layers(shared_layers) # input variables for training self.x = T.imatrix("x") self.v = T.matrix("v") self.xlen = T.matrix("xlen") # input variables for generation self.v_single = T.vector("v") self.nstep = T.iscalar("nstep") # the dropout masks self.x_drop = T.tensor3("x_drop") # drop the input self.y_drop = T.tensor3("y_drop") # drop the output self.forced_word = T.imatrix("forced_word") h_tm1 = T.vector("h_tm1") # hidden layer ouput word_t = T.ivector("word_t") # word indexes v_i = T.vector("v") # visual information # Generates the next word based on the: previous true word, hidden state & visual features # inputs: hiddent_layer, last_predicted word, visual features def recurrance(word_t, x_drop_slice, hh_drop_slice, use_v, h_tm1_hidden, h_tm1_cell, v_i): #word_t = theano.printing.Print("word_t")(word_t) # get the word embedding matrix or the context information if self.conf['DECODER']: x_t = ifelse(T.eq(use_v, 1), T.dot( v_i, self.wvm) + self.bmv, self.wemb[word_t]) else: x_t = ifelse(T.eq(use_v, 1), T.zeros_like( self.wemb[word_t]), self.wemb[word_t]) # if we are not doing minibatch training if word_t.ndim == 0: x_t = x_t.reshape((1, x_t.shape[0])) h_tm1_hidden = h_tm1_hidden.reshape((1, h_tm1_hidden.shape[0])) h_tm1_cell = 
h_tm1_cell.reshape((1, h_tm1_cell.shape[0])) # dropout on the input embddings if self.conf['DROP_INPUT']: x_t *= x_drop_slice # clip the gradients so they dont get too large h_tm1_hidden_clip = self.clipg(h_tm1_hidden) in_state = T.concatenate([x_t, h_tm1_hidden_clip], axis=1) if self.conf['BATCH_NORM']: mu = T.mean(in_state, axis=0, keepdims=True) var = T.var(in_state, axis=0, keepdims=True) normed_is = (in_state - mu) / T.sqrt(var + T.constant(1e-10, dtype=theano.config.floatX)) in_state = self.gamma_h * in_state + self.beta_h # calculate 8 dot products in one go dot_out = T.dot(in_state, self.w_lstm) lstm_hidden_size = self.conf['lstm_hidden_size'] # input gate ig = T.nnet.sigmoid(dot_out[:, :lstm_hidden_size]) # forget gate fg = T.nnet.sigmoid( dot_out[:, lstm_hidden_size:lstm_hidden_size*2]) # output gate og = T.nnet.sigmoid( dot_out[:, lstm_hidden_size*2:lstm_hidden_size*3]) # cell memory cc = fg * h_tm1_cell + ig * T.tanh(dot_out[:, lstm_hidden_size*3:]) # hidden state hh = og * cc # drop the output state if self.conf['DROP_OUTPUT']: hh_d = hh * hh_drop_slice # the distribution over output words if self.conf['SOFTMAX_OUT']: s_t = T.nnet.softmax(T.dot(hh_d, self.w) + self.b) else: s_t = T.nnet.sigmoid(T.dot(hh_d, self.w) + self.b) #hh = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_hidden, hh) #cc = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_cell, cc) if not self.conf['DECODER']: keep_idx = T.and_(T.eq(word_t, 0), T.eq(use_v, 0)) #keep_idx = theano.printing.Print("keep_idx")(keep_idx) if word_t.ndim != 0: keep_idx = keep_idx.dimshuffle((0, 'x')) #hh_ret = hh #hh_ret[keep_idx, :] = h_tm1_hidden[keep_idx, :] hh_ret = keep_idx * h_tm1_hidden + (1-keep_idx) * hh cc_ret = keep_idx * h_tm1_cell + (1-keep_idx) * cc else: hh_ret = hh cc_ret = cc # if we are not doing minibatch training if word_t.ndim == 0: hh_ret = hh_ret[0] cc_ret = cc_ret[0] return [hh_ret, cc_ret, s_t] # Generates the next word by feeding the old word as input # inputs: hiddent_layer, last_predicted word, visual features def recurrance_word_feedback(h_tm1_hidden, h_tm1_cell, word_t, use_visual, v_i): x_drop_val = T.ones( (self.conf['emb_size'],), dtype=theano.config.floatX) y_drop_val = T.ones( (self.conf['lstm_hidden_size'],), dtype=theano.config.floatX) [hh, cc, s_t] = recurrance( word_t, x_drop_val, y_drop_val, use_visual, h_tm1_hidden, h_tm1_cell, v_i) # the predicted word w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32')[0] return [hh, cc, s_t[0], w_idx, T.zeros((0,), dtype='int32')[0]] def recurrance_partial_word_feedback(word_t_real, x_drop_val, y_drop_val, use_visual, forced_word, h_tm1_hidden, h_tm1_cell, word_t_pred, v_i): word_last = T.switch(forced_word, word_t_real, word_t_pred) [hh, cc, s_t] = recurrance( word_last, x_drop_val, y_drop_val, use_visual, h_tm1_hidden, h_tm1_cell, v_i) # the predicted word w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32') return [hh, cc, s_t, w_idx] # build the teacher forcing loop use_visual_info = T.concatenate([T.ones((1,), dtype=np.int32), T.zeros( (self.conf['MAX_SENTENCE_LEN'],), dtype=np.int32)]) if self.conf['DECODER']: #h0_hidden_matrix = self.encoder.hh_out[self.encoder.conf['MAX_SENTENCE_LEN']] h0_hidden_matrix = self.h0_hidden * \ T.ones((self.x.shape[0], self.h0_hidden.shape[0])) v_input = T.concatenate( [self.encoder.hh_out[self.encoder.conf['MAX_SENTENCE_LEN']], self.v], axis=1) #v_input = T.printing.Print("v_input")(v_input) else: h0_hidden_matrix = self.h0_hidden * \ T.ones((self.x.shape[0], self.h0_hidden.shape[0])) v_input = self.v #v_input = 
T.printing.Print("v_input_v")(v_input) h0_cell_matrix = self.h0_cell * \ T.ones((self.x.shape[0], self.h0_cell.shape[0])) x_adj = T.concatenate( [T.zeros((1, self.x.T[0].shape[0]), dtype=self.x.dtype), self.x.T]) y_adj = T.concatenate( [self.x.T, T.zeros((1, self.x.T[0].shape[0]), dtype=self.x.dtype)]) [self.hh_out, self.cc_out, s], _ = theano.scan(fn=recurrance, sequences=[x_adj, self.x_drop.dimshuffle( (1, 0, 2)), self.y_drop.dimshuffle((1, 0, 2)), use_visual_info], n_steps=self.conf['MAX_SENTENCE_LEN']+1, non_sequences=v_input, outputs_info=[h0_hidden_matrix, h0_cell_matrix, None]) # build the semi-forced loop [_, _, s_semi, _], _ = theano.scan(fn=recurrance_partial_word_feedback, sequences=[x_adj, self.x_drop.dimshuffle((1, 0, 2)), self.y_drop.dimshuffle((1, 0, 2)), use_visual_info, self.forced_word[:, :self.x.shape[0]]], n_steps=self.conf['MAX_SENTENCE_LEN']+1, non_sequences=self.v, outputs_info=[h0_hidden_matrix, h0_cell_matrix, None, T.zeros((self.x.shape[0],), dtype=np.int32)]) # build the un-forced loop [_, _, _, self.wout_fb, _], _ = theano.scan(fn=recurrance_word_feedback, non_sequences=self.v_single, outputs_info=[self.h0_hidden, self.h0_cell, None, np.array( 0, dtype=np.int32), T.ones((1,), dtype=np.int32)[0]], n_steps=self.nstep) if self.conf['SEMI_FORCED'] < 1: s = s_semi self.new_s = s.reshape((s.shape[0] * s.shape[1], s.shape[2])) softmax_out = self.build_loss_function(self.new_s, y_adj) self.softmax_out = softmax_out # calculate the perplexity ff_small = T.constant(1e-20, dtype=theano.config.floatX) ppl_idx = softmax_out.shape[1] * \ T.arange(softmax_out.shape[0]) + T.flatten(y_adj) hsum = -T.log2(T.flatten(softmax_out)[ppl_idx] + ff_small) hsum_new = hsum.reshape((s.shape[0], s.shape[1])).T self.perplexity_sentence = 2 ** (T.sum(hsum_new, axis=1) / T.sum(self.xlen, axis=1)) self.perplexity_batch = 2 ** (T.sum(hsum * T.flatten(self.xlen.T)) / T.sum(self.xlen)) self.perplexity_batch_v = T.sum(hsum * T.flatten(self.xlen.T)) self.perplexity_batch_n = T.sum(self.xlen) # build the single step code h_hid = T.vector("h_hid") h_cell = T.vector("h_cell") x_drop_val = T.ones( (self.conf['emb_size'],), dtype=theano.config.floatX) y_drop_val = T.ones( (self.conf['lstm_hidden_size'],), dtype=theano.config.floatX) use_v = T.iscalar("use_v") word_t_s = T.iscalar("word_t_s") one_step_theano = recurrance( word_t_s, x_drop_val, y_drop_val, use_v, h_hid, h_cell, v_i) if self.conf['DECODER']: self.one_step = theano.function( [word_t_s, use_v, h_hid, h_cell, v_i], outputs=one_step_theano) else: tmp_x = T.imatrix("tmp_x") tmp_v = T.matrix("tmp_v") x_d_tmp = T.ones( (1, self.conf['MAX_SENTENCE_LEN'], self.conf['emb_size']), dtype=theano.config.floatX) y_d_tmp = T.ones( (1, self.conf['MAX_SENTENCE_LEN'], self.conf['lstm_hidden_size']), dtype=theano.config.floatX) x_d_tmp.type.broadcastable = (False, False, False) y_d_tmp.type.broadcastable = (False, False, False) self.start_step = theano.function([tmp_x, tmp_v], outputs=self.hh_out[self.conf['MAX_SENTENCE_LEN']], givens={self.x_drop: x_d_tmp, self.y_drop: y_d_tmp, self.x: tmp_x, self.v: tmp_v})
def build_model_trainer(self): self.X_sh_train_mask = theano.shared( name="X_sh_train_mask", value=self.X_train_mask, borrow=True) self.X_sh_train = theano.shared( name="X_sh_train", value=self.X_train, borrow=True) self.V_sh_train = theano.shared( name="V_sh_train", value=self.V_train, borrow=True) if self.conf["DECODER"]: self.X_sh_train_drop = theano.shared( name="X_sh_train_drop", value=self.X_train_drop, borrow=True) self.Y_sh_train_drop = theano.shared( name="Y_sh_train_drop", value=self.Y_train_drop, borrow=True) if self.conf['JOINED_MODEL']: self.X_sh_train_lm = theano.shared( name="X_sh_train_lm", value=self.X_train_lm, borrow=True) params_train = [getattr(self, p) for p in self.conf['param_names_trainable']] # build the list of masks (which select which rows may be backpropagated) params_bp_mask = [] for name in self.conf['param_names_trainable']: if name in self.conf['params_bp_mask']: params_bp_mask.append(self.conf['params_bp_mask'][name]) else: params_bp_mask.append(None) if self.conf["DECODER"]: encoder_params = [getattr(self.encoder, p) for p in self.encoder.conf['param_names_trainable']] params_train = params_train + encoder_params for name in self.encoder.conf['param_names_trainable']: if name in self.encoder.conf['params_bp_mask']: params_bp_mask.append( self.encoder.conf['params_bp_mask'][name]) else: params_bp_mask.append(None) # storage for historical gradients if not self.loaded_model or (not hasattr(self, 'hist_grad') and not hasattr(self, 'delta_grad')): self.hist_grad = [theano.shared(value=np.zeros_like( var.get_value()), borrow=True) for var in params_train] self.delta_grad = [theano.shared(value=np.zeros_like( var.get_value()), borrow=True) for var in params_train] if not self.conf["DECODER"]: return # calculate the cost for this minibatch (add L2 reg to loss function) regc = T.constant(self.conf['L2_REG_CONST'], dtype=theano.config.floatX) self.cost = self.loss + regc * \ np.sum([(xx ** 2).sum() for xx in params_train]) # build the SGD weight updates batch_size_f = T.constant( self.conf['batch_size_val'], dtype=theano.config.floatX) comp_grads = T.grad(self.cost, params_train) # if self.conf['DECODER']: #comp_grads[9] = T.printing.Print("Comp_grads_9")(comp_grads[9]) + 0.0000001*T.printing.Print("params_train_9")(params_train[9]) comp_grads = [g/batch_size_f for g in comp_grads] comp_grads = [T.clip(g, -self.conf['GRAD_CLIP_SIZE'], self.conf['GRAD_CLIP_SIZE']) for g in comp_grads] #comp_grads = [g*m if m is not None else g for g,m in zip(comp_grads, params_bp_mask) ] weight_updates = get_sgd_weight_updates(self.conf['GRAD_METHOD'], comp_grads, params_train, self.hist_grad, self.delta_grad, decay=self.conf['DECAY_RATE'], learning_rate=self.conf['LEARNING_RATE']) print("Weight updates:", len(weight_updates)) # if self.conf['DECODER']: # weight_updates[9] = (weight_updates[9][0], T.printing.Print("Comp_grads_9")(weight_updates[9][1])) indx = T.iscalar("indx") indx_wrap = indx % ( self.X_sh_train_drop.shape[0] - self.conf['batch_size_val']) indx_wrap2 = ( indx+1) % (self.X_sh_train_drop.shape[0] - self.conf['batch_size_val']) if self.conf['JOINED_MODEL']: self.train = theano.function([indx], outputs=[self.loss, self.cost, self.perplexity_batch], updates=weight_updates, givens={ self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']], self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']], self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], 
self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.mm_rnn.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']], self.mm_rnn.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']], self.mm_rnn.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], self.mm_rnn.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.mm_rnn.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.lm_rnn.x: self.X_sh_train_lm[indx:indx+self.conf['batch_size_val']], # self.V_sh_train[indx:indx+self.conf['batch_size_val']], self.lm_rnn.v: np.ones((self.conf['batch_size_val'], 1), dtype=theano.config.floatX), self.lm_rnn.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], self.lm_rnn.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.lm_rnn.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap + \ self.conf['batch_size_val']] }, on_unused_input='ignore') else: if self.conf['SEMI_FORCED'] < 1: inputs = [indx, self.forced_word] else: inputs = [indx] if self.conf['DECODER']: print(len(comp_grads)) print(len(params_train)) print(weight_updates) self.train = theano.function(inputs, outputs=[ self.loss, self.cost, self.perplexity_batch], updates=weight_updates, givens={ self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']], self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']], self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], self.encoder.x: self.encoder.X_sh_train[indx:indx+self.conf['batch_size_val']], self.encoder.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']], self.encoder.xlen: self.encoder.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], self.encoder.x_drop: self.X_sh_train_drop[indx_wrap2:indx_wrap2+self.conf['batch_size_val']], self.encoder.y_drop: self.Y_sh_train_drop[indx_wrap2:indx_wrap2+self.conf['batch_size_val']]}, on_unused_input='ignore') # else: # self.train = theano.function(inputs, # outputs=[self.loss, self.cost, self.perplexity_batch], # updates=weight_updates, # givens={ # self.x: self.X_sh_train[indx:indx+self.conf['batch_size_val']], # self.v: self.V_sh_train[indx:indx+self.conf['batch_size_val']], # self.xlen: self.X_sh_train_mask[indx:indx+self.conf['batch_size_val']], # self.x_drop: self.X_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']], # self.y_drop: self.Y_sh_train_drop[indx_wrap:indx_wrap+self.conf['batch_size_val']]}, # on_unused_input='ignore') if not self.conf['DECODER']: return None return self.train
# Shared state for the moment and velocity accumulators of the momentum-style
# update rule (theta, bias, X and y are assumed to be defined earlier).
m_theta = theano.shared(name='moment_theta',
                        value=np.zeros(3, dtype=theano.config.floatX))
m_bias = theano.shared(name='moment_bias',
                       value=np.zeros((1, 1), dtype=theano.config.floatX),
                       broadcastable=(True, True))
v_theta = theano.shared(name='velocity_theta',
                        value=np.zeros(3, dtype=theano.config.floatX))
v_bias = theano.shared(name='velocity_bias',
                       value=np.zeros((1, 1), dtype=theano.config.floatX),
                       broadcastable=(True, True))

params = [theta, bias]
moments = [m_theta, m_bias]
vel = [v_theta, v_bias]
one = T.constant(1.0)

# Feedforward pass: mean squared error of the linear model.
cost = T.mean((T.dot(theta, X.T) + bias - y) ** 2) / 2
cost_f = theano.function(inputs=[X, y], outputs=cost,
                         allow_input_downcast=True)

# Backward pass: symbolic gradients of the cost w.r.t. the parameters.
gradients = T.grad(cost, params)
grads = theano.function(inputs=[X, y], outputs=gradients,
                        allow_input_downcast=True)
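The snippet above relies on symbolic variables defined outside the excerpt. A minimal sketch of that assumed context (hypothetical shapes and names, not taken from the original), followed by evaluating the two compiled functions on concrete data:

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')      # (n_samples, 3) design matrix
y = T.vector('y')      # (n_samples,) targets
theta = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='theta')
bias = theano.shared(np.zeros((1, 1), dtype=theano.config.floatX),
                     name='bias', broadcastable=(True, True))

# With those definitions in place before the code above, the compiled
# functions can be called on NumPy arrays:
X_val = np.random.randn(5, 3)
y_val = np.random.randn(5)
print(cost_f(X_val, y_val))    # scalar cost
print(grads(X_val, y_val))     # [d_cost/d_theta, d_cost/d_bias]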
def fmp_shape(x, op):
    # Build the fractional max-pooling op (alpha and u come from the
    # enclosing scope; the `op` argument is unused here) and return the
    # shape of its output for the constant input x.
    return fmp.DisjointPseudorandomFractionalMaxPooling2DOp(
        alpha=alpha, u=u)(T.constant(x)).eval().shape
def power_of_2(previous_powers, coefficients):
    # One scan step: multiply the running values by the coefficients and
    # terminate the loop (via scan_module.until) as soon as the largest
    # entry is the first one.
    new_values = previous_powers * coefficients
    index = T.argmax(new_values)
    return new_values, theano.scan_module.until(T.eq(index, T.constant(0)))
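A minimal, self-contained sketch (the driver names are assumptions, not part of the original) of running this step function with theano.scan; the until condition ends the loop early when the first coefficient dominates:

import numpy as np
import theano
import theano.tensor as T

coefficients = T.vector('coefficients')
values, _ = theano.scan(fn=power_of_2,
                        outputs_info=T.ones_like(coefficients),
                        non_sequences=coefficients,
                        n_steps=16)
f = theano.function([coefficients], values)

# Stops after a single step here, because argmax of [2, 0.5, 1] is index 0.
print(f(np.array([2.0, 0.5, 1.0], dtype=theano.config.floatX)))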
import numpy as np
import theano as th
import theano.tensor as T
from theano.tensor import nlinalg
from utils import jitterChol, t_repeat
from GP_LVM_CMF import SGPDV

floatX = th.config.floatX
log2pi = T.constant(np.log(2*np.pi).astype(floatX))


class IBP_Factor(SGPDV):

    def __init__(self,
                 numberOfInducingPoints,  # Number of inducing points in the sparse GP
                 batchSize,               # Size of the mini-batch
                 dimX,                    # Dimensionality of the latent co-ordinates
                 dimZ,                    # Dimensionality of the latent variables
                 data,                    # [N x P] matrix of observations
                 kernelType='RBF',
                 encoderType_qX='FreeForm',   # 'FreeForm', 'MLP', 'Kernel'.
                 encoderType_rX='FreeForm',   # 'FreeForm', 'MLP', 'Kernel', 'NoEncoding'.
                 encoderType_ru='FreeForm',   # 'FreeForm', 'MLP', 'NoEncoding'
                 z_optimise=False,
                 numHiddenUnits_encoder=0,
                 numHiddentUnits_decoder=10,
                 continuous=True
                 ):
        # Previous signature:
        # self, numberOfInducingPoints, batchSize, dimX, dimZ, data, numHiddenUnits,
        # kernelType_='RBF', continuous_=True, encode_qX=True, encode_rX=False,
        # encode_ru=False, encoder_type='kernel'
def __init__(self, lookup_table, in_dim, hidden_dims, labels_nums, activation, highway=False, batch_size=64, initializer=default_initializer, optimizer=None, dropout=0, verbose=True): self.batch_size = batch_size self.num_task = len(labels_nums) word_index = T.itensor3() # (batch, max_len) gold_truth = T.ivector() # (batch, 1) mask_query = (word_index > 0) * T.constant(1, dtype=theano.config.floatX) mask_user = (T.sum(word_index, axis=2) > 0) * T.constant( 1, dtype=theano.config.floatX) word_embedding = lookup_table.W[word_index] # max sum averaging hidden = get_pooling_batch_word(word_embedding, mask_query, "averaging") hidden = get_pooling_batch(hidden, mask_user, "averaging") # hidden = T.mean(hidden, axis=1) if len(hidden_dims) == 0 or hidden_dims[0] == 0: nn_output = hidden nn_output_dim = in_dim elif highway: encoder = HighwayLayer(in_dim=in_dim, activation=activation, initializer=initializer, dropout=dropout, verbose=verbose) nn_output = encoder.forward_batch(hidden) nn_output_dim = encoder.out_dim else: encoder = MultiHiddenLayer(in_dim=in_dim, hidden_dims=hidden_dims, activation=activation, initializer=initializer, dropout=dropout, verbose=verbose) nn_output = encoder.forward_batch(hidden) nn_output_dim = encoder.out_dim if optimizer is None: sgd_optimizer = AdaGradOptimizer(lr=0.95, norm_lim=16) else: sgd_optimizer = optimizer self.train_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32) self.train_y = shared_zero_matrix((1, 1), dtype=np.int32) self.dev_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32) self.test_x = shared_zero_matrix((batch_size, 1, 1), dtype=np.int32) self.train_batch_list = list() self.pred_train_batch_list = list() self.pred_dev_batch_list = list() self.pred_test_batch_list = list() self.get_y_list = list() index = T.ivector() classifier_list = list() classifier_output_list = list() classifier_loss_list = list() classifier_param_list = list() classifier_updates_list = list() for i in xrange(len(labels_nums)): classifier = SoftmaxClassifier(num_in=nn_output_dim, num_out=labels_nums[i], initializer=initializer) classifier_list.append(classifier) classifier_output_list.append( classifier_list[i].forward(nn_output)) classifier_loss_list.append(classifier_list[i].loss( nn_output, gold_truth)) if len(hidden_dims) == 0 or hidden_dims[0] == 0: classifier_param_list.append(lookup_table.params + classifier.params) else: classifier_param_list.append(lookup_table.params + classifier.params + encoder.params) except_norm_list = [param.name for param in lookup_table.params] classifier_updates_list.append( sgd_optimizer.get_update(classifier_loss_list[i], classifier_param_list[i], except_norm_list)) train_batch = theano.function( inputs=[index], outputs=[classifier_output_list[i], classifier_loss_list[i]], updates=classifier_updates_list[i], givens={ word_index: self.train_x[index], gold_truth: self.train_y[index, i] }) self.train_batch_list.append(train_batch) pred_train_batch = theano.function( inputs=[index], outputs=classifier_output_list[i], givens={word_index: self.train_x[index]}) self.pred_train_batch_list.append(pred_train_batch) pred_dev_batch = theano.function( inputs=[index], outputs=classifier_output_list[i], givens={word_index: self.dev_x[index]}) self.pred_dev_batch_list.append(pred_dev_batch) pred_test_batch = theano.function( inputs=[index], outputs=classifier_output_list[i], givens={word_index: self.test_x[index]}) self.pred_test_batch_list.append(pred_test_batch) self.get_y_list.append( theano.function(inputs=[index], 
outputs=self.train_y[index, i]))
)  # closes a theano.function(...) call begun earlier

eval_fn = theano.function(
    [images, total_iters],
    cost.mean()
)

train_data, dev_data, test_data = lib.mnist_binarized.load(
    BATCH_SIZE, TEST_BATCH_SIZE
)

#############################################
############## Importance Sampling ##########

log2pi = T.constant(np.log(2*np.pi).astype(theano.config.floatX))
k_ = 10


def log_mean_exp(x, axis=1):
    # Numerically stable log-mean-exp over `axis` (k_ samples per row).
    m = T.max(x, axis=axis, keepdims=True)
    return m + T.log(T.sum(T.exp(x - m), axis=axis, keepdims=True)) - T.log(k_)


def log_lik(samples, mean, log_sigma):
    # Log-density of `samples` under a diagonal Gaussian with the given mean
    # and log standard deviation, summed over the feature axis.
    return -log2pi * T.cast(samples.shape[1], 'float32') / 2 - \
        T.sum(T.sqr((samples - mean) / T.exp(log_sigma)) + 2 * log_sigma,
              axis=1) / 2


vae_bound = reconst_cost + reg_cost
log_lik_latent_prior = log_lik(latents, 0., 0.)
log_lik_latent_posterior = log_lik(latents, mu, log_sigma)
loglikelihood_normal = (log_lik_latent_prior - reconst_cost
                        - log_lik_latent_posterior)
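A hedged sketch (the shapes and the reshape convention are assumptions, not taken from the surrounding code) of how these pieces are typically combined into a k_-sample importance-sampling estimate of the marginal log-likelihood:

log_weights = loglikelihood_normal.reshape((-1, k_))    # (batch, k_) log-weights
log_marginal = log_mean_exp(log_weights, axis=1)        # (batch, 1)
is_estimate = log_marginal.mean()                       # scalar estimate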
import os, sys
import numpy as np
import scipy as sp
import PIL
import theano
import theano.tensor as T
import pickle, cPickle
from sklearn import preprocessing
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from numpy.lib import stride_tricks
import theano.sandbox.rng_mrg as RNG_MRG

rng = np.random.RandomState()
MRG = RNG_MRG.MRG_RandomStreams(rng.randint(2**30))

# Constant -log(2*pi); the attached test value matches its (negative) sign.
c = -T.constant(np.log(2 * np.pi)).astype(theano.config.floatX)
c.tag.test_value = -np.log(2 * np.pi).astype(theano.config.floatX)


def unpickle(path):
    '''Load a pickled CIFAR-10 batch and return it as a dictionary.'''
    f = open(path, 'rb')
    data = cPickle.load(f)
    f.close()
    return data


def repmat_vec(x, k):
    # Repeat the vector x k times, giving a (k, len(x)) matrix.
    return T.tile(x.dimshuffle([0, 'x']), [1, k]).T
def normal(self, size, avg=0.0, std=1.0, ndim=None, dtype=None, nstreams=None, truncate=False, **kwargs): """ Sample a tensor of values from a normal distribution. Parameters ---------- size : int_vector_like Array dimensions for the output tensor. avg : float_like, optional The mean value for the truncated normal to sample from (defaults to 0.0). std : float_like, optional The standard deviation for the truncated normal to sample from (defaults to 1.0). truncate : bool, optional Truncates the normal distribution at 2 standard deviations if True (defaults to False). When this flag is set, the standard deviation of the result will be less than the one specified. ndim : int, optional The number of dimensions for the output tensor (defaults to None). This argument is necessary if the size argument is ambiguous on the number of dimensions. dtype : str, optional The data-type for the output tensor. If not specified, the dtype is inferred from avg and std, but it is at least as precise as floatX. kwargs Other keyword arguments for random number generation (see uniform). Returns ------- samples : TensorVariable A Theano tensor of samples randomly drawn from a normal distribution. """ size = _check_size(size) avg = undefined_grad(as_tensor_variable(avg)) std = undefined_grad(as_tensor_variable(std)) if dtype is None: dtype = scal.upcast(config.floatX, avg.dtype, std.dtype) avg = tensor.cast(avg, dtype=dtype) std = tensor.cast(std, dtype=dtype) # generate even number of uniform samples # Do manual constant folding to lower optiimizer work. if isinstance(size, theano.Constant): n_odd_samples = size.prod(dtype='int64') else: n_odd_samples = tensor.prod(size, dtype='int64') n_even_samples = n_odd_samples + n_odd_samples % 2 uniform = self.uniform((n_even_samples, ), low=0., high=1., ndim=1, dtype=dtype, nstreams=nstreams, **kwargs) # box-muller transform u1 = uniform[:n_even_samples // 2] u2 = uniform[n_even_samples // 2:] r = tensor.sqrt(-2.0 * tensor.log(u1)) theta = np.array(2.0 * np.pi, dtype=dtype) * u2 cos_theta, sin_theta = tensor.cos(theta), tensor.sin(theta) z0 = r * cos_theta z1 = r * sin_theta if truncate: # use valid samples to_fix0 = (z0 < -2.) | (z0 > 2.) to_fix1 = (z1 < -2.) | (z1 > 2.) z0_valid = z0[tensor.nonzero(~to_fix0)] z1_valid = z1[tensor.nonzero(~to_fix1)] # re-sample invalid samples to_fix0 = tensor.nonzero(to_fix0)[0] to_fix1 = tensor.nonzero(to_fix1)[0] n_fix_samples = to_fix0.size + to_fix1.size lower = tensor.constant(1. / np.e**2, dtype=dtype) u_fix = self.uniform((n_fix_samples, ), low=lower, high=1., ndim=1, dtype=dtype, nstreams=nstreams, **kwargs) r_fix = tensor.sqrt(-2. * tensor.log(u_fix)) z0_fixed = r_fix[:to_fix0.size] * cos_theta[to_fix0] z1_fixed = r_fix[to_fix0.size:] * sin_theta[to_fix1] # pack everything together to a useful result norm_samples = tensor.join(0, z0_valid, z0_fixed, z1_valid, z1_fixed) else: norm_samples = tensor.join(0, z0, z1) if isinstance(n_odd_samples, theano.Variable): samples = norm_samples[:n_odd_samples] elif n_odd_samples % 2 == 1: samples = norm_samples[:-1] else: samples = norm_samples samples = tensor.reshape(samples, newshape=size, ndim=ndim) samples *= std samples += avg return samples
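A brief usage sketch for the method above, assuming it is exposed on an MRG_RandomStreams instance as in theano.sandbox.rng_mrg; the names here are illustrative:

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)
samples = srng.normal(size=(3, 4), avg=0.0, std=2.0)   # Box-Muller samples
f = theano.function([], samples)
print(f().shape)                                        # (3, 4)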
def train_conv_net(datasets, U, ofile, cv=0, attr=0, img_w=300, filter_hs=[3, 4, 5], hidden_units=[100, 2], dropout_rate=[0.5], shuffle_batch=True, n_epochs=25, batch_size=50, lr_decay=0.95, conv_non_linear="relu", activations=[Iden], sqr_norm_lim=9, non_static=True): """ Train a simple conv net img_h = sentence length (padded where necessary) img_w = word vector length (300 for word2vec) filter_hs = filter window sizes hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer sqr_norm_lim = s^2 in the paper lr_decay = adadelta decay parameter """ rng = np.random.RandomState(3435) img_h = len(datasets[0][0][0]) filter_w = img_w feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("dropout", dropout_rate), ("batch_size", batch_size), ("non_static", non_static), ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)] print parameters #define model architecture index = T.lscalar() x = T.tensor3('x') y = T.ivector('y') mair = T.fmatrix('mair') Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[ (Words, T.set_subtensor(Words[0, :], zero_vec_tensor)) ], allow_input_downcast=True) conv_layers = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = LeNetConvPoolLayer(rng, image_shape=None, filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) conv_layers.append(conv_layer) layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0], x.shape[1], x.shape[2], Words.shape[1])) def convolve_user_statuses(statuses): layer1_inputs = [] def sum_mat(mat, out): z = ifelse(T.neq(T.sum(mat), T.constant(0)), T.constant(1), T.constant(0)) return out + z, theano.scan_module.until(T.eq(z, T.constant(0))) status_count, _ = theano.scan(fn=sum_mat, sequences=statuses, outputs_info=T.constant( 0, dtype=theano.config.floatX)) # Slice-out dummy (zeroed) sentences relv_input = statuses[:T.cast(status_count[-1], dtype='int32' )].dimshuffle(0, 'x', 1, 2) for conv_layer in conv_layers: layer1_inputs.append( conv_layer.set_input(input=relv_input).flatten(2)) features = T.concatenate(layer1_inputs, axis=1) avg_feat = T.max(features, axis=0) return avg_feat conv_feats, _ = theano.scan(fn=convolve_user_statuses, sequences=layer0_input) # Add Mairesse features layer1_input = T.concatenate([conv_feats, mair], axis=1) ##mairesse_change hidden_units[0] = feature_maps * len(filter_hs) + datasets[4].shape[ 1] ##mairesse_change classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, activations=activations, dropout_rates=dropout_rate) svm_data = T.concatenate( [classifier.layers[0].output, y.dimshuffle(0, 'x')], axis=1) #define parameters of the model and update functions using adadelta params = classifier.params for conv_layer in conv_layers: params += conv_layer.params if non_static: #if word vectors are allowed to change, add them as model parameters params += [Words] cost = classifier.negative_log_likelihood(y) dropout_cost = classifier.dropout_negative_log_likelihood(y) grad_updates = 
sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) #shuffle dataset and assign to mini batches. if dataset size is not a multiple of mini batches, replicate #extra data (at random) np.random.seed(3435) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size rand_perm = np.random.permutation(range(len(datasets[0]))) train_set_x = datasets[0][rand_perm] train_set_y = datasets[1][rand_perm] train_set_m = datasets[4][rand_perm] extra_data_x = train_set_x[:extra_data_num] extra_data_y = train_set_y[:extra_data_num] extra_data_m = train_set_m[:extra_data_num] new_data_x = np.append(datasets[0], extra_data_x, axis=0) new_data_y = np.append(datasets[1], extra_data_y, axis=0) new_data_m = np.append(datasets[4], extra_data_m, axis=0) else: new_data_x = datasets[0] new_data_y = datasets[1] new_data_m = datasets[4] rand_perm = np.random.permutation(range(len(new_data_x))) new_data_x = new_data_x[rand_perm] new_data_y = new_data_y[rand_perm] new_data_m = new_data_m[rand_perm] n_batches = new_data_x.shape[0] / batch_size n_train_batches = int(np.round(n_batches * 0.9)) #divide train set into train/val sets test_set_x = datasets[2] test_set_y = np.asarray(datasets[3], "int32") test_set_m = datasets[5] train_set_x, train_set_y, train_set_m = shared_dataset( (new_data_x[:n_train_batches * batch_size], new_data_y[:n_train_batches * batch_size], new_data_m[:n_train_batches * batch_size])) val_set_x, val_set_y, val_set_m = shared_dataset( (new_data_x[n_train_batches * batch_size:], new_data_y[n_train_batches * batch_size:], new_data_m[n_train_batches * batch_size:])) n_val_batches = n_batches - n_train_batches val_model = theano.function( [index], classifier.errors(y), givens={ x: val_set_x[index * batch_size:(index + 1) * batch_size], y: val_set_y[index * batch_size:(index + 1) * batch_size], mair: val_set_m[index * batch_size:(index + 1) * batch_size] }, ##mairesse_change allow_input_downcast=True) #compile theano functions to get train/val/test errors test_model = theano.function( [index], [classifier.errors(y), svm_data], givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size], mair: train_set_m[index * batch_size:(index + 1) * batch_size] }, ##mairesse_change allow_input_downcast=True) train_model = theano.function( [index], cost, updates=grad_updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size], mair: train_set_m[index * batch_size:(index + 1) * batch_size] }, ##mairesse_change allow_input_downcast=True) test_y_pred = classifier.predict(layer1_input) test_error = T.sum(T.neq(test_y_pred, y)) true_p = T.sum(test_y_pred * y) false_p = T.sum(test_y_pred * T.mod(y + T.ones_like(y), T.constant(2, dtype='int32'))) false_n = T.sum( y * T.mod(test_y_pred + T.ones_like(y), T.constant(2, dtype='int32'))) test_model_all = theano.function( [ x, y, mair ##mairesse_change ], [test_error, true_p, false_p, false_n, svm_data], allow_input_downcast=True) test_batches = test_set_x.shape[0] / batch_size #start training over mini-batches print '... 
training' epoch = 0 best_val_perf = 0 val_perf = 0 test_perf = 0 fscore = 0 cost_epoch = 0 while (epoch < n_epochs): start_time = time.time() epoch = epoch + 1 if shuffle_batch: for minibatch_index in np.random.permutation( range(n_train_batches)): cost_epoch = train_model(minibatch_index) set_zero(zero_vec) else: for minibatch_index in xrange(n_train_batches): cost_epoch = train_model(minibatch_index) set_zero(zero_vec) train_losses = [test_model(i) for i in xrange(n_train_batches)] train_perf = 1 - np.mean([loss[0] for loss in train_losses]) val_losses = [val_model(i) for i in xrange(n_val_batches)] val_perf = 1 - np.mean(val_losses) epoch_perf = 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%' % ( epoch, time.time() - start_time, train_perf * 100., val_perf * 100.) print(epoch_perf) ofile.write(epoch_perf + "\n") ofile.flush() if val_perf >= best_val_perf: best_val_perf = val_perf test_loss_list = [ test_model_all( test_set_x[idx * batch_size:(idx + 1) * batch_size], test_set_y[idx * batch_size:(idx + 1) * batch_size], test_set_m[idx * batch_size:(idx + 1) * batch_size] ##mairesse_change ) for idx in xrange(test_batches) ] if test_set_x.shape[0] > test_batches * batch_size: test_loss_list.append( test_model_all( test_set_x[test_batches * batch_size:], test_set_y[test_batches * batch_size:], test_set_m[test_batches * batch_size:] ##mairesse_change )) test_loss_list_temp = test_loss_list test_loss_list = np.asarray([t[:-1] for t in test_loss_list]) test_loss = np.sum(test_loss_list[:, 0]) / float( test_set_x.shape[0]) test_perf = 1 - test_loss tp = np.sum(test_loss_list[:, 1]) fp = np.sum(test_loss_list[:, 2]) fn = np.sum(test_loss_list[:, 3]) tn = test_set_x.shape[0] - (tp + fp + fn) fscore = np.mean([ 2 * tp / float(2 * tp + fp + fn), 2 * tn / float(2 * tn + fp + fn) ]) svm_test = np.concatenate([t[-1] for t in test_loss_list_temp], axis=0) svm_train = np.concatenate([t[1] for t in train_losses], axis=0) output = "Test result: accu: " + str( test_perf) + ", macro_fscore: " + str(fscore) + "\ntp: " + str( tp) + " tn:" + str(tn) + " fp: " + str(fp) + " fn: " + str( fn) print output ofile.write(output + "\n") ofile.flush() # dump train and test features cPickle.dump(svm_test, open("cvte" + str(attr) + str(cv) + ".p", "wb")) cPickle.dump(svm_train, open("cvtr" + str(attr) + str(cv) + ".p", "wb")) updated_epochs = refresh_epochs() if updated_epochs != None and n_epochs != updated_epochs: n_epochs = updated_epochs print 'Epochs updated to ' + str(n_epochs) return test_perf, fscore
def recurrance(word_t, x_drop_slice, hh_drop_slice, use_v, h_tm1_hidden, h_tm1_cell, v_i): #word_t = theano.printing.Print("word_t")(word_t) # get the word embedding matrix or the context information if self.conf['DECODER']: x_t = ifelse(T.eq(use_v, 1), T.dot( v_i, self.wvm) + self.bmv, self.wemb[word_t]) else: x_t = ifelse(T.eq(use_v, 1), T.zeros_like( self.wemb[word_t]), self.wemb[word_t]) # if we are not doing minibatch training if word_t.ndim == 0: x_t = x_t.reshape((1, x_t.shape[0])) h_tm1_hidden = h_tm1_hidden.reshape((1, h_tm1_hidden.shape[0])) h_tm1_cell = h_tm1_cell.reshape((1, h_tm1_cell.shape[0])) # dropout on the input embddings if self.conf['DROP_INPUT']: x_t *= x_drop_slice # clip the gradients so they dont get too large h_tm1_hidden_clip = self.clipg(h_tm1_hidden) in_state = T.concatenate([x_t, h_tm1_hidden_clip], axis=1) if self.conf['BATCH_NORM']: mu = T.mean(in_state, axis=0, keepdims=True) var = T.var(in_state, axis=0, keepdims=True) normed_is = (in_state - mu) / T.sqrt(var + T.constant(1e-10, dtype=theano.config.floatX)) in_state = self.gamma_h * in_state + self.beta_h # calculate 8 dot products in one go dot_out = T.dot(in_state, self.w_lstm) lstm_hidden_size = self.conf['lstm_hidden_size'] # input gate ig = T.nnet.sigmoid(dot_out[:, :lstm_hidden_size]) # forget gate fg = T.nnet.sigmoid( dot_out[:, lstm_hidden_size:lstm_hidden_size*2]) # output gate og = T.nnet.sigmoid( dot_out[:, lstm_hidden_size*2:lstm_hidden_size*3]) # cell memory cc = fg * h_tm1_cell + ig * T.tanh(dot_out[:, lstm_hidden_size*3:]) # hidden state hh = og * cc # drop the output state if self.conf['DROP_OUTPUT']: hh_d = hh * hh_drop_slice # the distribution over output words if self.conf['SOFTMAX_OUT']: s_t = T.nnet.softmax(T.dot(hh_d, self.w) + self.b) else: s_t = T.nnet.sigmoid(T.dot(hh_d, self.w) + self.b) #hh = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_hidden, hh) #cc = ifelse(T.eq(word_t, 0) and T.eq(use_v, 0), h_tm1_cell, cc) if not self.conf['DECODER']: keep_idx = T.and_(T.eq(word_t, 0), T.eq(use_v, 0)) #keep_idx = theano.printing.Print("keep_idx")(keep_idx) if word_t.ndim != 0: keep_idx = keep_idx.dimshuffle((0, 'x')) #hh_ret = hh #hh_ret[keep_idx, :] = h_tm1_hidden[keep_idx, :] hh_ret = keep_idx * h_tm1_hidden + (1-keep_idx) * hh cc_ret = keep_idx * h_tm1_cell + (1-keep_idx) * cc else: hh_ret = hh cc_ret = cc # if we are not doing minibatch training if word_t.ndim == 0: hh_ret = hh_ret[0] cc_ret = cc_ret[0] return [hh_ret, cc_ret, s_t]
def sum_mat(mat, out):
    # One scan step: z is 1 if the matrix `mat` has any non-zero entry and 0
    # otherwise (ifelse is theano.ifelse.ifelse). The running count `out` is
    # incremented, and the loop terminates at the first all-zero matrix.
    z = ifelse(T.neq(T.sum(mat), T.constant(0)),
               T.constant(1), T.constant(0))
    return out + z, theano.scan_module.until(T.eq(z, T.constant(0)))
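A minimal, self-contained sketch (hypothetical variable names, and assuming sum_mat above is in scope with ifelse imported from theano.ifelse) of how such a step function counts the leading non-zero matrices of a 3-D tensor before hitting zero padding:

import numpy as np
import theano
import theano.tensor as T

statuses = T.tensor3('statuses')      # (n_matrices, rows, cols)
counts, _ = theano.scan(fn=sum_mat,
                        sequences=statuses,
                        outputs_info=T.constant(0, dtype=theano.config.floatX))
count_fn = theano.function([statuses], counts[-1])

data = np.zeros((4, 2, 2), dtype=theano.config.floatX)
data[0] += 1.0
data[1] += 2.0                        # two real matrices followed by padding
print(count_fn(data))                 # 2.0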
def rollout(x0, H, gamma0, pol, dyn, cost, angle_dims=[], z=None, mm_state=True, mm_cost=True, noisy_policy_input=True, noisy_cost_input=True, truncate_gradient=-1, extra_shared=[], split_H=2, **kwargs): ''' Given some initial state particles x0, and a prediction horizon H (number of timesteps), returns a set of trajectories sampled from the dynamics model and the discounted costs for each step in the trajectory. ''' msg = 'Building computation graph for rollout' utils.print_with_stamp(msg, 'mc_pilco.rollout') msg = 'Moment-matching [state: %s, cost:%s]' msg += ', State measurement noise [policy: %s, cost: %s]' opts = (mm_state, mm_cost, noisy_policy_input, noisy_cost_input) utils.print_with_stamp(msg % opts, 'mc_pilco.rollout') # define internal scan computations def step_rollout(z1, z2, z2_prev, cumm_cost, x, sn, gamma, *args): ''' Single step of rollout. ''' n = x.shape[0] n = n.astype(theano.config.floatX) # noisy state measruement for control xn = x + z2_prev * (0.5 * sn) if noisy_policy_input else x # get next state distribution x_next, sn_next = propagate_particles(x, xn, pol, dyn, angle_dims, **kwargs) def eval_cost(xn, mxn=None, Sxn=None): c = cost(xn, None) # moment-matching for cost if mm_cost: # compute input moments if mxn is None: mxn = xn.mean(0) if Sxn is None: Sxn = (xn.T.dot(xn) / n - tt.outer(mxn, mxn)) # propagate gaussian through cost (should be implemented in # cost func) mc = cost(mxn, Sxn)[0] # no moment-matching else: mc = c.sum() / n return mc, c # if resampling (moment-matching for state) if mm_state: mx_next = x_next.mean(0) Sx_next = x_next.T.dot(x_next) / n - tt.outer(mx_next, mx_next) x_next = mx_next + z1.dot(tt.slinalg.cholesky(Sx_next).T) # noisy state measurement for cost xn_next = x_next if noisy_cost_input: xn_next += z2 * sn_next # get cost of applying action: mc_next, c_next = eval_cost(xn_next) else: mc_next, c_next = eval_cost(xn_next, mx_next, Sx_next) # no moment-matching for state else: # noisy state measurement for cost xn_next = x_next + z2 * sn_next if noisy_cost_input else x_next # get cost of applying action: mc_next, c_next = eval_cost(xn_next) c_next = gamma * c_next mc_next = gamma * mc_next cumm_cost += mc_next return [c_next, cumm_cost, x_next, sn_next, gamma * gamma0] # these are the shared variables that will be used in the scan graph. 
# we need to pass them as non_sequences here # see: http://deeplearning.net/software/theano/library/scan.html nseq = [gamma0] nseq.extend(dyn.get_intermediate_outputs()) nseq.extend(pol.get_intermediate_outputs()) nseq.extend(extra_shared) # loop over the planning horizon mode = theano.compile.mode.get_mode('FAST_RUN') accum_cost = tt.constant(0, dtype=x0.dtype) costs, trajectories = [], [] # if split_H > 1, this results in truncated BPTT H_ = tt.ceil(H * 1.0 / split_H).astype('int32') for i in range(1, split_H + 1): start_idx = (i - 1) * H_ + 1 end_idx = start_idx + H_ output = theano.scan(fn=step_rollout, sequences=[ z[0, start_idx:end_idx], z[1, start_idx:end_idx], z[1, -end_idx:-start_idx] ], outputs_info=[ None, accum_cost, x0, 1e-4 * tt.ones_like(x0), gamma0 ], non_sequences=nseq, strict=True, allow_gc=False, truncate_gradient=truncate_gradient, name="mc_pilco>rollout_scan_%d" % i, mode=mode) rollout_output, rollout_updts = output costs_i, accum_cost, trajectories_i = rollout_output[:3] accum_cost = accum_cost[-1] costs.append(costs_i) trajectories.append(trajectories_i) x0 = trajectories_i[-1, :, :] # x0 = theano.gradient.disconnected_grad(x0) costs = tt.concatenate(costs) trajectories = tt.concatenate(trajectories) trajectories.name = 'trajectories' # first axis: batch, second axis: time step costs = costs.T # first axis; batch, second axis: time step trajectories = trajectories.transpose(1, 0, 2) return [accum_cost, costs, trajectories], rollout_updts
def setup(self, params, gparams, shapes=None, max_norm=5.0, lr=0.01, eps=1e-6, rho=0.95, method="ADADELTA", beta=0.0, count=None, weight_l2=0): # Setup only once assert not self.updates if not shapes: shapes = params if not count: count = T.constant(1, dtype=FLOATX) else: count = T.cast(count, FLOATX) gcache = [ theano.shared(np.zeros_like(param.get_value(borrow=True), dtype=FLOATX), name="gcache_%s" % param.name) for param in shapes ] gcache_mean = [g / self.batch_counter for g in gcache] optimize_updates = optimize_parameters(params, gcache_mean, shapes, max_norm, lr, eps, rho, method, beta, gsum_regularization=0.0001, weight_l2=weight_l2, clip=self.clip) self.updates.extend(optimize_updates) self.caches.extend(gcache) if self.realtime: # Realtime update needs_update = self.batch_counter >= T.constant(self.batch_size) update_dict = OrderedDict() for param, update_val in optimize_updates: update_dict[param] = ifelse(needs_update, update_val, param) for cache, g in zip(gcache, gparams): update_dict[cache] = ifelse(needs_update, g, cache + g) update_dict[self.batch_counter] = ifelse( needs_update, count, self.batch_counter + count) return update_dict.items() else: # Manual update, perhaps in the end of one iteration gcache_updates = [(c, c + g) for c, g in zip(gcache, gparams)] + [ (self.batch_counter, self.batch_counter + count) ] return gcache_updates
def __init__(self, vocab, encoding, units, opt, initializer, srng, layers=1, regularizer=None, activity_reg=0, temporal_activity_reg=0, zoneout=0.5, input_droput=0.1, output_dropout=0.5, eps=1e-9): # Parameters self.vocab = vocab self.encoding = T.constant(np.int32(encoding), name='encoding') assert len(encoding.shape) == 2 x_k = encoding.shape[0] code_k = encoding.shape[1] self.lstm = LSTMModel(x_k=x_k, srng=srng, initializer=initializer, units=units, layers=layers, activity_reg=activity_reg, temporal_activity_reg=temporal_activity_reg, zoneout=zoneout, input_droput=input_droput, output_dropout=output_dropout) yw = K.variable(initializer((units, code_k))) yb = K.variable(initializer((code_k, ))) self.params = self.lstm.params + [yw, yb] # Training p1 = T.nnet.sigmoid(T.dot(self.lstm.train_y, yw) + yb) # (depth, n, code) xcode = self.encoding[self.lstm.xr, :] # (depth, n, code) assert xcode.ndim == 3 # nllrp = (xcode * T.log2(eps + p1)) + ((1 - xcode) * (T.log2(eps + 1. - p1))) # (depth, n, code) nllrp = T.switch(xcode, p1, 1. - p1) # (depth, n, code) nllr = -T.sum(T.log(eps + nllrp), axis=2) nll = T.mean(nllr, axis=None) loss_param_reg = T.constant(0.) if regularizer: for p in self.params: if p.ndim > 1: loss_param_reg += regularizer(p) loss = nll + self.lstm.loss_activity + self.lstm.loss_temporal_activity + loss_param_reg updates = opt.get_updates(loss, self.params) self.train_fun = theano.function([self.lstm.input_x], [ nll, self.lstm.loss_activity, self.lstm.loss_temporal_activity, loss_param_reg, loss ], updates=updates) # Testing # old version """ p1 = T.nnet.sigmoid(T.dot(self.lstm.test_y, yw) + yb) # (depth, n, code) #nllrp = (xcode * T.log(eps + p1)) + ((1 - xcode) * (T.log(eps + 1. - p1))) nllrp = T.switch(xcode, p1, 1. - p1) nllr = -T.sum(T.log(eps+nllrp), axis=2) # (depth, n) nll_part = T.transpose(nllr, (1, 0)) # (n, depth) self.nll_fun = theano.function([self.lstm.input_x], nll_part) """ p1 = T.nnet.sigmoid(T.dot(self.lstm.test_y, yw) + yb) # (depth, n, code) # xcode: (depth, n, code) # encoding: (x_k, code) h = (T.dot(T.log(eps + p1), T.transpose(self.encoding, (1, 0))) + T.dot(T.log(eps + 1. - p1), T.transpose(1 - self.encoding, (1, 0))) ) # (depth, n, x_k) p2 = softmax_nd(h) # (depth, n, x_k) mg = T.mgrid[0:p2.shape[0], 0:p2.shape[1]] pt = p2[mg[0], mg[1], self.lstm.xr] # (depth, n) nll_part = T.transpose(-T.log(eps + pt), (1, 0)) self.nll_fun = theano.function([self.lstm.input_x], nll_part) train_headers = [ 'NLL', 'Activity Reg', 'Temporal Reg', 'Weight Reg', 'Loss' ] val_headers = ['NLL', 'PPL'] weights = self.params + opt.weights super(LSTMSoftmaxSparse, self).__init__(weights=weights, train_headers=train_headers, val_headers=val_headers)
class Opt(object): merge = theano.gof.MergeOptimizer() gemm_opt_1 = theano.gof.TopoOptimizer(theano.tensor_opt.gemm_pattern_1) gemm_opt_2 = theano.gof.TopoOptimizer( # d -= a * (dot()+transpose(dot)) theano.gof.PatternSub( (T.sub_inplace, 'd', (T.mul, dict(pattern=(T.DimShuffle( (), ['x', 'x'], inplace=True), 'a'), allow_multiple_clients=True), (T.add, (T.dot, 'b', 'c'), (T.transpose_inplace, (T.dot, 'f', 'g'))))), (T.gemm, (T.gemm, 'd', (T.neg, 'a'), (T.transpose_inplace, 'g'), (T.transpose_inplace, 'f'), T.constant(1.0)), (T.neg, 'a'), 'b', 'c', T.constant(1.0)), allow_multiple_clients=False)) sqr = [] sqr.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.mul, 'x', 'x'), (T.sqr, 'x'), allow_multiple_clients=True))) sqr.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.pow, 'x', (T.DimShuffle( (), ['x', 'x'], inplace=True), T.constant(2))), (T.sqr, 'x'), allow_multiple_clients=True))) ident_opt_list = [] ident_opt_list.append( # remove explicit copies theano.gof.TopoOptimizer( theano.gof.PatternSub((T.tensor_copy, 'x'), 'x', allow_multiple_clients=True))) ident_opt_list.append( # remove double-transpose theano.gof.TopoOptimizer( theano.gof.PatternSub( (T.transpose_inplace, (T.transpose_inplace, 'x')), 'x', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.sqr, (T.sqrt, 'x')), 'x', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.sqrt, (T.sqr, 'x')), 'x', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.mul, 'x', (T.div, 'y', 'x')), 'y', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.mul, (T.div, 'y', 'x'), 'x'), 'y', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.div, (T.mul, 'y', 'x'), 'x'), 'y', allow_multiple_clients=True))) ident_opt_list.append( theano.gof.TopoOptimizer( theano.gof.PatternSub((T.div, (T.mul, 'y', 'x'), 'y'), 'x', allow_multiple_clients=True))) def __call__(self, env): self.merge(env) #eliminate identities if 0: print 'SKIPPING optimizations' else: for opt in self.ident_opt_list: opt(env) for opt in self.sqr: opt(env) self.gemm_opt_1(env) self.gemm_opt_2(env) self.merge(env)
def _infer_ndim_bcast(ndim, shape, *args): """ Infer the number of dimensions from the shape or the other arguments. Returns ------- (int, variable, tuple) triple, where the variable is an integer vector, and the tuple contains Booleans The first element returned is the inferred number of dimensions. The second element is the shape inferred (combining symbolic and constant informations from shape and args). The third element is a broadcasting pattern corresponding to that shape. """ # Find the minimum value of ndim required by the *args if args: args_ndim = max(arg.ndim for arg in args) else: args_ndim = 0 if isinstance(shape, (tuple, list)): # there is a convention that -1 means the corresponding shape of a # potentially-broadcasted symbolic arg # # This case combines together symbolic and non-symbolic shape # information shape_ndim = len(shape) if ndim is None: ndim = shape_ndim else: if shape_ndim != ndim: raise ValueError( 'ndim should be equal to len(shape), but\n', 'ndim = %s, len(shape) = %s, shape = %s' % (ndim, shape_ndim, shape)) bcast = [] pre_v_shape = [] for i, s in enumerate(shape): if hasattr(s, 'type'): # s is symbolic bcast.append(False) # todo - introspect further pre_v_shape.append(s) else: if s >= 0: pre_v_shape.append(tensor.as_tensor_variable(s)) bcast.append((s == 1)) elif s == -1: n_a_i = 0 for a in args: # ndim: _ _ _ _ _ _ # ashp: s0 s1 s2 s3 # i if i >= ndim - a.ndim: n_a_i += 1 a_i = i + a.ndim - ndim if not a.broadcastable[a_i]: pre_v_shape.append(a.shape[a_i]) bcast.append(False) break else: if n_a_i == 0: raise ValueError( ('Auto-shape of -1 must overlap' 'with the shape of one of the broadcastable' 'inputs')) else: pre_v_shape.append(tensor.as_tensor_variable(1)) bcast.append(True) else: ValueError('negative shape', s) # post-condition: shape may still contain both symbolic and # non-symbolic things if len(pre_v_shape) == 0: v_shape = tensor.constant([], dtype='int64') else: v_shape = tensor.stack(pre_v_shape) elif shape is None: # The number of drawn samples will be determined automatically, # but we need to know ndim if not args: raise TypeError(('_infer_ndim_bcast cannot infer shape without' ' either shape or args')) template = reduce(lambda a, b: a + b, args) v_shape = template.shape bcast = template.broadcastable ndim = template.ndim else: v_shape = tensor.as_tensor_variable(shape) if v_shape.ndim != 1: raise TypeError( "shape must be a vector or list of scalar, got '%s'" % v_shape) if ndim is None: ndim = tensor.get_vector_length(v_shape) bcast = [False] * ndim if v_shape.ndim != 1: raise TypeError("shape must be a vector or list of scalar, got '%s'" % v_shape) if (not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint'))): raise TypeError('shape must be an integer vector or list', v_shape.dtype) if args_ndim > ndim: raise ValueError( 'ndim should be at least as big as required by args value', (ndim, args_ndim), args) assert ndim == len(bcast) return ndim, tensor.cast(v_shape, 'int64'), tuple(bcast)
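A small illustration (assuming the helper above is in scope) of the inference it performs for a fully explicit shape: the ndim is taken from the length of the tuple and no dimension is broadcastable.

import theano

ndim, v_shape, bcast = _infer_ndim_bcast(None, (2, 3))
print(ndim, bcast)                      # 2 (False, False)
print(theano.function([], v_shape)())   # [2 3]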
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data eps = numpy.float32(1e-24) xdata = theano.shared(data['train_x'], name='xdata') ydata = theano.shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] # Store eucledian gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store jacobi diagonal self.js = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] # Compute jacobi nw_outs = safe_clone(model.outs, replace=replace) final_results = dict(zip(model.params, [None] * n_params)) for nw_out, out_operator in zip(nw_outs, model.outs_operator): if out_operator == 'sigmoid': denom = numpy.float32(options['cbs']) #denom *= nw_out #denom *= (numpy.float32(1) - nw_out) elif out_operator == 'softmax': denom = numpy.float32(options['cbs']) denom *= (nw_out + eps) else: denom = numpy.float32(options['cbs']) factor = TT.sqrt(numpy.float32(1) / denom) if out_operator == 'sigmoid': tnwout = TT.nnet.sigmoid(nw_out) factor = TT.sqrt(tnwout * (numpy.float32(1) - tnwout)) * factor r = TT.sgn(srng.normal(nw_out.shape)) r = r * factor loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] jvs = TT.Lop(nw_out, loc_params, r) for lp, lj in zip(loc_params, jvs): if final_results[lp] is None: final_results[lp] = TT.sqr(lj) else: final_results[lp] = final_results[lp] + TT.sqr(lj) nw_js = [ oj + final_results[p] for oj, p in zip(args[1 + n_params:1 + 2 * n_params], model.params) ] return [args[0] + const(1)] + nw_gs + nw_js ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] ij = [ TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig + ij, n_steps=n_steps, mode=gpu_mode, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]] updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js))) grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) #theano.printing.pydotprint(self.compute_eucledian_gradients, # 'eucledian_grad', scan_graphs=True) # Step 2. 
Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] self.damping = theano.shared(numpy.float32(options['mreg'])) mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs], Ms=self.js, rtol=options['mrtol'], shift=self.damping, maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs, shared_data)] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc0, acc1): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) nw_cost2 = safe_clone(model.train_cost, replace=dict(zip(model.inputs, nw_inps))) return [_idx + const(1), acc0 + nw_cost, acc1 + nw_cost2] acc0 = const([0]) acc1 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0, acc1], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1].sum() / const(n_steps) cost0 = rvals[2].sum() / const(n_steps) grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)]) rho = (final_cost - cost0) / denom print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], [final_cost, rho], givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = numpy.inf n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=gpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
def adadelta(loss_or_grads, params, learning_rate=1.0, rho=0.95, epsilon=1e-6): """ Adadelta updates Scale learning rates by the ratio of accumulated gradients to accumulated updates, see [1]_ and notes for further description. Parameters ---------- loss_or_grads : symbolic expression or list of expressions A scalar loss expression, or a list of gradient expressions params : list of shared variables The variables to generate update expressions for learning_rate : float or symbolic scalar The learning rate controlling the size of update steps rho : float or symbolic scalar Squared gradient moving average decay factor epsilon : float or symbolic scalar Small value added for numerical stability Returns ------- OrderedDict A dictionary mapping each parameter to its update expression Notes ----- rho should be between 0 and 1. A value of rho close to 1 will decay the moving average slowly and a value close to 0 will decay the moving average fast. rho = 0.95 and epsilon=1e-6 are suggested in the paper and reported to work for multiple datasets (MNIST, speech). In the paper, no learning rate is considered (so learning_rate=1.0). Probably best to keep it at this value. epsilon is important for the very first update (so the numerator does not become 0). Using the step size eta and a decay factor rho the learning rate is calculated as: .. math:: r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\ \\eta_t &= \\eta \\frac{\\sqrt{s_{t-1} + \\epsilon}} {\sqrt{r_t + \epsilon}}\\\\ s_t &= \\rho s_{t-1} + (1-\\rho)*(\\eta_t*g)^2 References ---------- .. [1] Zeiler, M. D. (2012): ADADELTA: An Adaptive Learning Rate Method. arXiv Preprint arXiv:1212.5701. """ grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() # Using theano constant to prevent upcasting of float32 one = T.constant(1) for param, grad in zip(params, grads): value = param.get_value(borrow=True) # accu: accumulate gradient magnitudes accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) # delta_accu: accumulate update magnitudes (recursively!) delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) # update accu (as in rmsprop) accu_new = rho * accu + (one - rho) * grad**2 updates[accu] = accu_new # compute parameter update, using the 'old' delta_accu update = (grad * T.sqrt(delta_accu + epsilon) / T.sqrt(accu_new + epsilon)) updates[param] = param - learning_rate * update # update delta_accu (as accu, but accumulating updates) delta_accu_new = rho * delta_accu + (one - rho) * update**2 updates[delta_accu] = delta_accu_new return updates
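A short usage sketch (hypothetical model variables, and assuming the module-level helpers used by the function above, get_or_compute_grads and OrderedDict, are available): build a scalar loss over shared parameters and compile a training function with the returned update dictionary.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(np.zeros((20, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')

# Softmax regression negative log-likelihood as an example loss.
p_y = T.nnet.softmax(T.dot(x, W) + b)
loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

updates = adadelta(loss, [W, b], learning_rate=1.0, rho=0.95, epsilon=1e-6)
train_fn = theano.function([x, y], loss, updates=updates)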
def adam_vlr(loss_or_grads, params, lr_map, beta1=0.9, beta2=0.999, epsilon=1e-8): """Adam updates with Variable Learning Rates Adam updates implemented as in [1]_. Parameters ---------- loss_or_grads : symbolic expression or list of expressions A scalar loss expression, or a list of gradient expressions params : list of shared variables The variables to generate update expressions for lr_map : dictionary of floats Learning rate map containing layer name and associated learning rate beta1 : float Exponential decay rate for the first moment estimates. beta2 : float Exponential decay rate for the second moment estimates. epsilon : float Constant for numerical stability. Returns ------- OrderedDict A dictionary mapping each parameter to its update expression Notes ----- The paper [1]_ includes an additional hyperparameter lambda. This is only needed to prove convergence of the algorithm and has no practical use (personal communication with the authors), it is therefore omitted here. References ---------- .. [1] Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. arXiv preprint arXiv:1412.6980. """ all_grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params) t_prev = theano.shared(utils.floatX(0.)) updates = OrderedDict() # Using theano constant to prevent upcasting of float32 one = T.constant(1) t = t_prev + 1 for param, g_t in zip(params, all_grads): a_t = lr_map[param] * T.sqrt(one - beta2**t) / (one - beta1**t) value = param.get_value(borrow=True) m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) m_t = beta1 * m_prev + (one - beta1) * g_t v_t = beta2 * v_prev + (one - beta2) * g_t**2 step = a_t * m_t / (T.sqrt(v_t) + epsilon) updates[m_prev] = m_t updates[v_prev] = v_t updates[param] = param - step updates[t_prev] = t return updates
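Note that, although the docstring mentions layer names, the code above indexes lr_map by the parameter variables themselves. A short sketch of that convention, reusing the hypothetical x, y, W, b and loss from the previous sketch:

lr_map = {W: 1e-3, b: 1e-2}    # per-parameter step sizes, keyed by the shared variables
updates = adam_vlr(loss, [W, b], lr_map)
train_fn = theano.function([x, y], loss, updates=updates)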
def batch_norm(self, h, dim, use_shift=True, use_std=True, use_sample=0.0, force_sample=False, index=None, sample_mean=None, gamma=None, beta=None, depth_norm=False): x = h if h.ndim == 3: if index is None: index = self.index x = h.reshape((h.shape[0] * h.shape[1], h.shape[2]))[(index.flatten() > 0).nonzero()] elif h.ndim == 4: # index is sizes here assert index is not None x = h.reshape((h.shape[0] * h.shape[1] * h.shape[2], h.shape[3])) #x = x[(T.gt(x,numpy.float32(0))>0).nonzero()] mean = T.mean(x, axis=0) std = T.sqrt(T.mean((x - mean)**2, axis=0)) if sample_mean is None: sample_mean = self.add_param(theano.shared( numpy.zeros((dim, ), 'float32'), '%s_%s_mean' % (self.name, h.name)), custom_update=mean, custom_update_normalized=True) self.sample_mean = sample_mean sample_std = T.sqrt(T.mean((x - sample_mean)**2, axis=0)) if not self.train_flag and not force_sample: use_sample = 1.0 mean = T.constant(1. - use_sample, 'float32') * mean + T.constant( use_sample, 'float32') * sample_mean std = T.constant(1. - use_sample, 'float32') * std + T.constant( use_sample, 'float32') * sample_std if h.ndim == 3: mean = mean.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1) std = std.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1) elif h.ndim == 4: mean = mean.dimshuffle('x', 'x', 'x', 0).repeat( h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2) std = std.dimshuffle('x', 'x', 'x', 0).repeat( h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2) else: mean = mean.dimshuffle('x', 0).repeat(h.shape[0], axis=0) std = std.dimshuffle('x', 0).repeat(h.shape[0], axis=0) bn = (h - mean) / (std + numpy.float32(1e-10)) if use_std: if gamma is None: gamma = self.add_param( self.shared( numpy.zeros((dim, ), 'float32') + numpy.float32(0.1), "%s_%s_gamma" % (self.name, h.name))) self.gamma = gamma if h.ndim == 3: bn *= gamma.dimshuffle('x', 'x', 0).repeat(h.shape[0], axis=0).repeat(h.shape[1], axis=1) elif h.ndim == 4: bn *= gamma.dimshuffle('x', 'x', 'x', 0).repeat( h.shape[0], axis=0).repeat(h.shape[1], axis=1).repeat(h.shape[2], axis=2) else: bn *= gamma.dimshuffle('x', 0).repeat(h.shape[0], axis=0) if use_shift: if beta is None: beta = self.add_param( self.shared(numpy.zeros((dim, ), 'float32'), "%s_%s_beta" % (self.name, h.name))) self.beta = beta bn += beta if depth_norm: bn = bn / (T.sqrt(2)**self.D) return bn
def normal(self, size, avg=0.0, std=1.0, ndim=None, dtype=None, nstreams=None):
    """
    :param size: Can be a list of integers or Theano variables (ex: the shape
        of another Theano Variable)
    :param dtype: The output data type. If dtype is not specified, it will be
        inferred from the dtype of avg and std, but will be at least as
        precise as floatX.
    :param nstreams: Number of streams.
    """
    # We need an even number of ]0,1[ samples. Then we split them
    # in two halves. First half becomes our U1's for Box-Muller,
    # second half our U2's. See Wikipedia page:
    # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
    avg = as_tensor_variable(avg)
    std = as_tensor_variable(std)
    if dtype is None:
        dtype = scal.upcast(config.floatX, avg.dtype, std.dtype)
    avg = cast(avg, dtype)
    std = cast(std, dtype)
    evened = False
    constant = False
    if isinstance(size, tuple) and \
            all([isinstance(i, (numpy.integer, int)) for i in size]):
        constant = True
        # Force dtype because it defaults to float when size is empty
        n_samples = numpy.prod(size, dtype='int64')
        if n_samples % 2 == 1:
            n_samples += 1
            evened = True
    else:
        # if even, don't change, if odd, +1
        n_samples = prod(size) + (prod(size) % 2)
    flattened = self.uniform(size=(n_samples,), dtype=dtype, nstreams=nstreams)

    if constant:
        U1 = flattened[:n_samples // 2]
        U2 = flattened[n_samples // 2:]
    else:
        U1 = flattened[:prod(flattened.shape) // 2]
        U2 = flattened[prod(flattened.shape) // 2:]

    #normal_samples = zeros_like(flattened)
    sqrt_ln_U1 = sqrt(-2.0 * log(U1))
    # TypeError: 'TensorVariable' object does not support item assignment
    # so this doesn't work...
    #normal_samples[:n_samples/2] = sqrt_ln_U1 * cos(2.0*numpy.pi*U2)
    #normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)
    # so trying this instead
    first_half = sqrt_ln_U1 * cos(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
    second_half = sqrt_ln_U1 * sin(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
    normal_samples = join(0, first_half, second_half)

    final_samples = None
    if evened:
        final_samples = normal_samples[:-1]
    elif constant:
        final_samples = normal_samples
    else:
        final_samples = normal_samples[:prod(size)]

    if not size:
        # Force the dtype to be int64, otherwise reshape complains
        size = tensor.constant(size, dtype='int64')
    final_samples = final_samples.reshape(size)
    final_samples = avg + std * final_samples

    assert final_samples.dtype == dtype
    return final_samples
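# --- Illustrative numpy check (not part of the snippet above) of the Box-Muller
# transform that `normal` implements: a pair of uniform samples U1, U2 in ]0,1[
# is mapped to a pair of independent standard normal samples.
import numpy

rng = numpy.random.RandomState(0)
n = 500000
U1 = rng.uniform(low=1e-7, high=1.0, size=n)   # keep away from 0 for log()
U2 = rng.uniform(size=n)

r = numpy.sqrt(-2.0 * numpy.log(U1))
z = numpy.concatenate([r * numpy.cos(2.0 * numpy.pi * U2),
                       r * numpy.sin(2.0 * numpy.pi * U2)])

print(z.mean(), z.std())   # approximately 0.0 and 1.0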
def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None,
             sparse=False, cost_scale=1.0, input_scale=1.0,
             L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
             output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
             with_bias=True,
             mask="unity", dropout=0.0, batch_drop=False,
             batch_norm=False, bn_use_sample=False,
             layer_drop=0.0, residual=False, carry=False,
             sparse_filtering=False, gradient_scale=1.0, trainable=True,
             device=None, dtype='float32',
             **kwargs):
    """
    :param list[NetworkBaseLayer.Layer] sources: list of source layers
    :param int n_out: output dim of W_in and dim of bias
    :param float L1: l1-param-norm regularization
    :param float L2: l2-param-norm regularization
    :param str mask: "unity" or "dropout"
    :type dropout: float
    """
    super(Layer, self).__init__(**kwargs)
    self.index = index
    self.sources = sources
    ":type: list[Layer]"
    self.num_sources = len(sources)
    self.D = max([s.D for s in sources if isinstance(s, Layer)] + [0])
    if mask is None:
        mask = 'none'
    self.set_attr('mask', mask)
    self.set_attr('dropout', dropout)
    self.set_attr('sparse', sparse)
    self.set_attr('bn_use_sample', bn_use_sample)
    self.set_attr('sparse_filtering', sparse_filtering)
    if not trainable:
        self.set_attr('trainable', trainable)  # only store if not default
        self.gradient_scale = 0.0  # just to be sure
    else:
        self.gradient_scale = gradient_scale
        if gradient_scale != 1.0:
            self.set_attr('gradient_scale', gradient_scale)
    self.set_attr('layer_drop', layer_drop)
    assert not carry, "not supported anymore"
    self.set_attr('residual', residual)
    self.set_attr('n_out', n_out)
    self.set_attr('L1', L1)
    self.set_attr('L2', L2)
    if L2_eye:
        self.set_attr('L2_eye', L2_eye)
    self.device = device  # if device else str(theano.config.device)
    for s in self.sources:
        s.transfer_output(self.device)
    self.set_attr('varreg', varreg)
    if output_L2_reg:
        self.set_attr('output_L2_reg', output_L2_reg)
    if output_entropy_reg:
        self.set_attr('output_entropy_reg', output_entropy_reg)
    if output_entropy_exp_reg:
        self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
    self.set_attr('batch_norm', batch_norm)
    self.set_attr('input_scale', input_scale)
    if y_in is not None:
        self.y_in = {}
        for k in y_in:
            if not isinstance(y_in[k], T.Variable):
                continue
            self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
            self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
    else:
        self.y_in = None
    self.constraints = T.constant(0)
    if target:
        self.set_attr('target', target)
    if target_index:
        self.set_attr('target_index', target_index)
        assert target_index in self.network.j
        self.index = index = self.network.j[target_index]
    if cost_scale != 1:
        self.set_attr("cost_scale", cost_scale)
    if with_bias:
        self.b = self.add_param(self.create_bias(n_out), 'b_%s' % self.name)
    else:
        self.set_attr('with_bias', False)
        self.b = numpy.float32(0)
    self.mass = T.constant(1., name="mass_%s" % self.name, dtype='float32')
    self.masks = [None] * len(self.sources)
    assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
    if mask == "dropout" or (mask == 'none' and dropout > 0):
        assert 0.0 < dropout < 1.0
        # If we apply this mass during training then we don't need any mask
        # or mass for testing. The expected weight should be 1 in
        #   E[x] = mass * (1 - dropout)
        # so mass has to be 1 / (1 - dropout).
        self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        srng = RandomStreams(self.rng.randint(1234) + 1)
        if self.depth > 1:
            self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout,
                                               size=(s.attrs['n_out'], self.depth)),
                                 theano.config.floatX)
                          for s in self.sources]
        else:
            if batch_drop:
                self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout,
                                                   size=s.output.shape),
                                     theano.config.floatX)
                              for s in self.sources]
            else:
                self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout,
                                                   size=(s.attrs['n_out'],)),
                                     theano.config.floatX)
                              for s in self.sources]
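# --- Quick numerical check (illustrative, not from the original source) of the
# dropout comment above: scaling the kept units by mass = 1 / (1 - dropout)
# keeps the expected activation unchanged, so no rescaling is needed at test time.
import numpy

rng = numpy.random.RandomState(42)
dropout = 0.3
mass = 1.0 / (1.0 - dropout)

x = numpy.ones((100000, 10), dtype='float32')
keep_mask = rng.binomial(n=1, p=1 - dropout, size=x.shape).astype('float32')

dropped = mass * keep_mask * x
print(dropped.mean())   # approximately 1.0, i.e. E[x] = mass * (1 - dropout) * x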
def __init__(self, X, n_in, n_out, n_hidden_layers, n_units_in,
             n_units_hidden, M_lst=None, m_lst=None, sigma_W_params_lst=None,
             sigma_b_params_lst=None, sigma_W=1e-3, tune_sigma_W=True,
             sigma_b=1e-6, tune_sigma_b=True, l_W=1e-6, l_b=1e-6,
             diag_noise=True, approx_cols=False,
             divide_1st_layer_by_its_n_out=False, b_out_deterministic=False,
             seed=None):
    assert n_hidden_layers > 0, 'n_hidden_layers must be positive'
    n_layers = n_hidden_layers + 1

    M_lst = [None] * n_layers if M_lst is None else M_lst
    m_lst = [None] * n_layers if m_lst is None else m_lst
    if sigma_W_params_lst is None:
        sigma_W_params_lst = [None] * n_layers
    if sigma_b_params_lst is None:
        sigma_b_params_lst = [None] * n_layers

    assert \
        len(M_lst) == len(m_lst) == len(sigma_W_params_lst) == \
        len(sigma_b_params_lst) == n_layers, \
        'length of all lists must be the same and equal to ' \
        '(n_hidden_layers + 1) where the +1 is for the output layer mapping'

    # set seed to ensure each layer is init differently (cf. seed += 1)
    seed = np.random.randint(int(1e6)) if seed is None else seed
    np.random.seed(seed)

    def activation(x):
        return T.nnet.relu(x, alpha=0.1)

    self.in_layer = GaussLayer(
        input=X, n_in=n_in, n_out=n_units_in,
        M=M_lst[0], m=m_lst[0],
        sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
        sigma_W_params=sigma_W_params_lst[0],
        sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
        sigma_b_params=sigma_b_params_lst[0],
        l_W=l_W, l_b=l_b, diag_noise=diag_noise,
        activation=activation, approx_cols=approx_cols,
        seed=seed, name='h1'
    )
    self.layers = [self.in_layer]
    seed += 1

    # specific settings necessary for initialisation of deep GPs
    if divide_1st_layer_by_its_n_out:
        sqrt_n_out = T.constant(self.in_layer.n_out ** 0.5, dtype=floatX)
        self.in_layer.output /= sqrt_n_out

    # the first hidden layer was already set up above
    for i in xrange(1, n_hidden_layers):
        prev_layer = self.layers[-1]
        layer = GaussLayer(
            input=prev_layer.output, n_in=prev_layer.n_out,
            n_out=n_units_hidden,
            M=M_lst[i], m=m_lst[i],
            sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
            sigma_W_params=sigma_W_params_lst[i],
            sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
            sigma_b_params=sigma_b_params_lst[i],
            l_W=l_W, l_b=l_b, diag_noise=diag_noise,
            activation=activation, name='h' + str(i + 1),
            approx_cols=approx_cols, seed=seed
        )
        self.layers += [layer]
        seed += 1

    # initialised separately because of the necessary linear activation
    prev_layer = self.layers[-1]
    self.out_layer = GaussLayer(
        input=prev_layer.output, n_in=prev_layer.n_out, n_out=n_out,
        M=M_lst[-1], m=m_lst[-1],
        sigma_W=sigma_W, tune_sigma_W=tune_sigma_W,
        sigma_W_params=sigma_W_params_lst[-1],
        sigma_b=sigma_b, tune_sigma_b=tune_sigma_b,
        sigma_b_params=sigma_b_params_lst[-1],
        l_W=l_W, l_b=l_b, diag_noise=diag_noise,
        b_is_deterministic=b_out_deterministic,
        approx_cols=approx_cols, name='out', seed=seed
    )
    self.layers += [self.out_layer]

    self.softmax = SoftmaxLayer(
        input=self.out_layer.output,
        name='softmax'
    )

    self.params = reduce(
        lambda x, y: x + y, [layer.grad_params for layer in self.layers]
    )
    self.input = X

    self.p_y_given_x = self.softmax.p_y_given_x
    self.y_pred = self.softmax.y_pred
    self.mean_log_likelihood = self.softmax.mean_log_likelihood
    self.errors = self.softmax.errors

    # self.kl_W = T.sum([layer.kl_W() for layer in self.layers])
    # self.kl_b = T.sum([layer.kl_b() for layer in self.layers])
    # self.kl = self.kl_W + self.kl_b
    self.effect_kl_W = T.sum([layer.effect_kl_W() for layer in self.layers])
    self.effect_kl_b = T.sum([layer.effect_kl_b() for layer in self.layers])
    self.effect_kl = self.effect_kl_W + self.effect_kl_b
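# --- Hedged usage sketch for the constructor above, kept as comments because the
# enclosing class name is not visible in this snippet; `BayesianNet` is a
# placeholder, and GaussLayer / SoftmaxLayer are assumed to come from the same
# project. Sizes and the objective below are illustrative only.
# import theano.tensor as T
# X = T.matrix('X')
# net = BayesianNet(X, n_in=784, n_out=10, n_hidden_layers=2,
#                   n_units_in=500, n_units_hidden=500, seed=123)
# # a typical variational objective would combine data fit and KL, e.g.
# # cost = -net.mean_log_likelihood + net.effect_kl / n_train_examples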