def get_square_norm_gradients_scan(D_by_layer, cost, accum=0):
    """Accumulate per-example squared gradient norms using ``theano.scan``.

    For every layer description in ``D_by_layer`` the cost is differentiated
    with respect to the layer output.  A layer with a 'weight' entry
    contributes, for each (input row, backpropagated row) pair, the sum of
    squares of their outer product; a layer with a 'bias' entry contributes
    the sum of squares of each backpropagated row.

    Taking the element-wise square root of the result gives the 2-norms,
    which is what importance sampling needs.

    Parameters
    ----------
    D_by_layer : dict
        Maps a layer name to a dict with an 'output' entry, an 'input'
        entry (read only when 'weight' is present) and optional 'weight' /
        'bias' entries.  Only the presence of 'weight' / 'bias' is used.
    cost : theano variable
        Cost differentiated with respect to each layer output.
    accum : number or theano variable
        Starting value of the accumulator.

    Returns
    -------
    theano variable (or ``accum`` unchanged when no layer contributes)
        Intended to have shape (minibatch_size,): one value per example.
    """
    for D in D_by_layer.values():
        backprop_output = tensor.grad(cost, D['output'])

        if 'weight' in D:
            per_example, _ = theano.scan(
                fn=lambda a_row, b_row: tensor.sqr(tensor.outer(a_row, b_row)).sum(),
                sequences=[D['input'], backprop_output])
            accum = accum + per_example

        if 'bias' in D:
            per_example, _ = theano.scan(
                fn=lambda b_row: tensor.sqr(b_row).sum(),
                sequences=[backprop_output])
            accum = accum + per_example

    return accum
def contraction_penalty(self, inputs):
    """
    Calculate (symbolically) the contracting autoencoder penalty term.

    Parameters
    ----------
    inputs : tensor_like
        Theano symbolic representing the input minibatch on which the
        penalty is calculated. Assumed to be a 2-tensor, with the first
        dimension indexing training examples and the second indexing
        data dimensions.

    Returns
    -------
    contract_penalty : tensor_like
        Scalar: the squared Frobenius norm of the encoder Jacobian, summed
        over the minibatch and divided by the number of examples. Add this
        to the output of a Cost object, such as SquaredError, to penalize
        it.
    """
    act_grad = self._activation_grad(inputs)
    # Assuming the encoder is act(inputs . W + b), the Jacobian entry for
    # input i and hidden unit j is act_grad[j] * W[i, j]; its squared
    # Frobenius norm is sum_j act_grad[j]^2 * sum_i W[i, j]^2.  The weights
    # therefore have to be squared *before* being summed over axis 0.
    squared_weight_sums = tensor.sqr(self.weights).sum(axis=0)
    frob_norm = tensor.dot(tensor.sqr(act_grad), squared_weight_sums)
    contract_penalty = frob_norm.sum() / inputs.shape[0]
    return contract_penalty
def orthogonal_penalty(W, D, epsilon=1e-6, axis=1):
    """Hinge penalty on the squared cosine between slices of W and D.

    Along ``axis`` the squared inner product of W and D is divided by the
    product of their squared norms (i.e. the squared cosine).  Values above
    ``epsilon ** 2`` are kept, the rest contribute zero, and everything is
    summed into a single symbolic scalar.
    """
    inner_product = T.sum(W * D, axis=axis)
    w_sq_norm = T.sum(T.sqr(W), axis=axis)
    d_sq_norm = T.sum(T.sqr(D), axis=axis)

    squared_cosine = T.sqr(inner_product) / (w_sq_norm * d_sq_norm)
    margin = squared_cosine - (epsilon**2)

    # [margin]_+ written as a product with the boolean mask.
    return T.sum(margin * (margin > 0))
def get_mean_square_norm_gradients_variance_method_00(D_by_layer, cost, accum=0):
    """Accumulate, per example, the squared distance to the minibatch gradient.

    For each layer the cost is differentiated with respect to the layer
    output.  With ``a`` an input row, ``b`` the matching backpropagated row
    and ``G`` the gradient of the cost w.r.t. the weights, a 'weight' entry
    adds ``|a|^2 |b|^2 - 2 a^T G b + |G|^2`` (the expansion of
    ``|a b^T - G|^2``).  A 'bias' entry adds the squared distance between
    ``b`` and the gradient w.r.t. the bias.

    Gradients with respect to 'weight' / 'bias' are only requested for
    layers that actually have those entries, so layers without parameters
    (or with only one kind) are handled.

    Parameters
    ----------
    D_by_layer : dict
        Maps a layer name to a dict with 'output', and optionally 'input',
        'weight' and 'bias' entries ('input' is needed when 'weight' is
        present).
    cost : theano variable
        Cost being differentiated.
    accum : number or theano variable
        Starting value of the accumulator.

    Returns
    -------
    theano variable (or ``accum`` unchanged when no layer contributes)
        Intended to have shape (minibatch_size,).
    """
    for D in D_by_layer.values():
        backprop_output = tensor.grad(cost, D['output'])

        if 'weight' in D:
            layer_input = D['input']
            grad_weight = tensor.grad(cost, D['weight'])

            input_square_norms = tensor.sqr(layer_input).sum(axis=1)
            backprop_output_square_norms = tensor.sqr(backprop_output).sum(axis=1)

            A = input_square_norms * backprop_output_square_norms
            # Every example gets this same constant term added.
            C = tensor.sqr(grad_weight).sum()
            B = (backprop_output.dot(grad_weight.T) * layer_input).sum(axis=1)
            accum = accum + (A - 2*B + C)

        if 'bias' in D:
            grad_bias = tensor.grad(cost, D['bias'])
            # This last `sum` could be a component-wise `max` to carry the
            # maximum of the variances instead of the sum of squares.
            accum = accum + tensor.sqr(
                backprop_output - grad_bias.reshape((1, -1))).sum(axis=1)

    return accum
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9, word_vec_name='Words'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)

    Parameters
    ----------
    params : list of theano shared variables
        Parameters to optimise.
    cost : theano variable
        Cost differentiated with respect to every parameter.
    rho : float
        Decay rate of both running averages.
    epsilon : float
        Constant added under both square roots.
    norm_lim : float
        Column norms of constrained matrices are clipped to
        ``sqrt(norm_lim)``.
    word_vec_name : str
        Name of the parameter that is exempt from the column-norm
        constraint.

    Returns
    -------
    OrderedDict
        Updates for the two accumulators of each parameter and for the
        parameter itself.
    """
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_ups = OrderedDict()
    gparams = []
    for param in params:
        empty = numpy.zeros_like(param.get_value())
        # NOTE: both accumulators carry the same "exp_grad_" name prefix;
        # kept unchanged for compatibility.
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)

    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step

        # Only 2-D parameters other than the word-vector table get the
        # column-norm constraint; the exempt name comes from the argument.
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            tmp = stepped_param * scale
            tmp = T.cast(tmp, 'float32')
            updates[param] = tmp
        else:
            updates[param] = stepped_param
    return updates
def updates(self, cost, params, learning_rate=0.1, momentum=0.95, rescale=5.):
    """Build RMSprop-with-momentum updates with global gradient-norm rescaling.

    Parameters
    ----------
    cost : theano variable
        Cost to differentiate.
    params : list of theano shared variables
        Parameters to update; ``self.running_square_``,
        ``self.running_avg_`` and ``self.memory_`` are indexed in the same
        order.
    learning_rate, momentum : float
        Step size and momentum coefficient.
    rescale : float
        Threshold on the global L2 norm of the gradients; when the norm
        exceeds it, every gradient is scaled by ``rescale / norm``.

    Returns
    -------
    list of (shared variable, new value) pairs
    """
    grads = T.grad(cost, params)

    # Global L2 norm of all gradients.  The square root is taken exactly
    # once so that the comparison with `rescale` is made on the norm itself.
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling = rescale / T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4

    update_pairs = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # With a non-finite norm, fall back to a small pull on the parameter.
        grad = T.switch(not_finite, 0.1 * param, grad * scaling)

        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad

        # Running variance estimate; clamp at zero so rounding error cannot
        # put a negative value under the square root.
        rms_grad = T.sqrt(T.maximum(new_square - new_avg ** 2, 0.))
        rms_grad = T.maximum(rms_grad, minimum_grad)

        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad

        update_pairs.append((old_square, new_square))
        update_pairs.append((old_avg, new_avg))
        update_pairs.append((memory, update))
        update_pairs.append((param, param + update2))
    return update_pairs
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters
    as well as inference values (u and s);
    will detect and use convolutional or fully connected version

    Parameters
    ----------
    X : theano variable with ndim 4 (normalised over axes 0, 2, 3) or
        ndim 2 (normalised over axis 0).
    rescale, reshift : theano variables or None
        When both are given the normalised value is multiplied by
        ``exp(0.2 * rescale)`` and shifted by ``reshift``.
    u, s : theano variables or None
        Mean and variance to use instead of batch statistics.  Batch
        statistics are computed unless both are given.
    e : float
        Constant added to the variance before the square root.

    Raises
    ------
    NotImplementedError
        If ``X.ndim`` is neither 2 nor 4.
    """
    g = rescale
    b = reshift
    if X.ndim == 4:
        if u is not None and s is not None:
            # use normalization params given a priori
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # compute normalization params from input
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        # batch normalize
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        # Same rule as the 4-D branch: fall back to batch statistics unless
        # both u and s were supplied.
        if u is None or s is None:
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        # batch normalize
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g) + b
    else:
        raise NotImplementedError
    return X
def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)

    Returns an OrderedDict with updates for the two running averages of
    every parameter and for the parameter itself.  When ``max_norm`` is
    truthy, every parameter whose name differs from ``word_vec_name`` has
    its norms along axis 0 clipped to ``sqrt(max_norm)``, whatever its rank.
    """
    print("Generating adadelta updates")
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_ups = OrderedDict()
    gparams = []

    # First pass: allocate both accumulators and take the gradients.
    for param in params:
        accumulator_name = "exp_grad_%s" % param.name
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(), name=accumulator_name)
        gparams.append(T.grad(cost, param))
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)

    # Second pass: build the update expressions.
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]

        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg

        step = -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)

        stepped_param = param + step
        if not (max_norm and param.name != word_vec_name):
            updates[param] = stepped_param
            continue

        col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
        desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
        updates[param] = stepped_param * (desired_norms / (1e-7 + col_norms))
    return updates
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9, word_vec_name='embedding'):
    """Build Adadelta updates with a column-norm constraint on matrices.

    Parameters
    ----------
    params : list of theano shared variables
        Parameters to optimise.
    cost : theano variable
        Cost differentiated with respect to every parameter.
    rho : float
        Decay rate of both running averages.
    epsilon : float
        Constant added under both square roots.
    norm_lim : float
        Column norms of constrained matrices are clipped to
        ``sqrt(norm_lim)``.
    word_vec_name : str
        Name of the parameter that is exempt from the column-norm
        constraint.

    Returns
    -------
    OrderedDict
        Updates for the two accumulators of each parameter and for the
        parameter itself.
    """
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_ups = OrderedDict()
    gparams = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        # NOTE: both accumulators carry the same "exp_grad_" name prefix;
        # kept unchanged for compatibility.
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)

    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step

        # Only 2-D parameters other than the word-vector table get the
        # column-norm constraint; the exempt name comes from the argument.
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
def _calc_regularization_cost(self):
    """Return the regularization cost, including W1 and W2 without bias.

    The parent class computes the cost for the parameters stored in
    self.regularize.  W1 and W2 are handled here by hand because their last
    column holds biases, which must not be regularized.  For that reason
    W1 and W2 must NOT be added to self.regularize.

    Returns
    -------
    theano variable
        parent regularization cost plus the L1 / L2 terms of W1 and W2
        (last column excluded), each added only when its weight is nonzero.
    """
    cost = super(SLmNce, self)._calc_regularization_cost()

    # Everything except the trailing bias column.
    w1_no_bias = self.W1[:, :-1]
    w2_no_bias = self.W2[:, :-1]

    if self.l1_weight != 0:
        l1_cost = T.sum(T.abs_(w1_no_bias)) + T.sum(T.abs_(w2_no_bias))
        cost += self.l1_weight * l1_cost

    if self.l2_weight != 0:
        l2_cost = T.sum(T.sqr(w1_no_bias)) + T.sum(T.sqr(w2_no_bias))
        cost += self.l2_weight * l2_cost

    return cost
def applyConstraint(self, param):
    """Rescale ``param`` so each unit's weight vector has norm ``self.norm``.

    A 2-D parameter is normalised column-wise (axis 0).  A 4-D parameter is
    flattened to 2-D, normalised row-wise (axis 1) and reshaped back to its
    original shape.  Any other rank triggers a warning and is then
    normalised along axis 0.

    Parameters
    ----------
    param : theano variable
        Parameter to constrain.

    Returns
    -------
    theano variable
        The rescaled parameter, with the same shape as the input.
    """
    if param.ndim != 4 and param.ndim != 2:
        # The whole message is parenthesised so that `%` formats the complete
        # string rather than only its last fragment.
        warnings.warn(("Norm constraints are normally applied to matrices"
                       " or 4-dimensional tensors, but currently got "
                       "%d dimensions, please make sure this is the desired"
                       " parameter to apply norm constraints") % param.ndim)

    needFlip = False
    if param.ndim == 4:
        # a hack for conv layer filters
        prevShape = param.shape
        # conv layer filter shape is (nChannelOut, nChannelIn, r, c)
        param = param.flatten(2)
        # now it is (nout, nin), which is different from (nin, nout)
        # of fully connected networks, so the norm axis is flipped
        needFlip = True

    norm_axis = 1 if needFlip else 0
    unit_norm = T.sqrt(T.sum(T.sqr(param), axis=norm_axis, keepdims=True))

    # 1e-7 guards against division by zero for all-zero weight vectors.
    param = param / (unit_norm + 1e-7)
    param = param * self.norm

    if needFlip:
        param = param.reshape(prevShape)

    return param
def get_updates_adadelta(grads, params, decay=0.95):
    """Build Adadelta updates for ``params`` given their gradients.

    Parameters
    ----------
    grads : list of theano variables
        Gradients, in the same order as ``params``.
    params : list of theano shared variables
        Parameters to update.
    decay : float
        Decay rate of both running averages.

    Returns
    -------
    OrderedDict
        Updates for the two running averages of each parameter and for the
        parameter itself.
    """
    from collections import OrderedDict

    decay = constantX(decay)
    epsilon = constantX(1e-7)
    print('build updates with adadelta')

    updates = OrderedDict()
    for param, grad in zip(params, grads):
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient
        new_mean_squared_grad = \
            decay * mean_square_grad + \
            (1. - decay) * T.sqr(grad)

        # Compute update
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

        # Accumulate updates
        new_mean_square_dx = \
            decay * mean_square_dx + \
            (1. - decay) * T.sqr(delta_x_t)

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t

    return updates
def AdadeltaUpdate(params, cost, stepSize=1.0, rho=0.95, epsilon=1e-6, norm_lim=9,
                   word_vec_name='wordVec'):
    """Build Adadelta updates with an extra step-size factor.

    Parameters
    ----------
    params : list of theano shared variables
        Parameters to optimise.
    cost : theano variable
        Cost differentiated with respect to every parameter.
    stepSize : float
        Multiplier applied to the Adadelta step before it is added to the
        parameter (the running average of steps uses the unscaled step).
    rho : float
        Decay rate of both running averages.
    epsilon : float
        Constant added under both square roots.
    norm_lim : float
        Column norms of constrained matrices are clipped to
        ``sqrt(norm_lim)``.
    word_vec_name : str
        Name of the parameter that is exempt from the column-norm
        constraint.

    Returns
    -------
    OrderedDict
        Updates for the two accumulators of each parameter and for the
        parameter itself.
    """
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_update = OrderedDict()
    g_params = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        # NOTE: both accumulators carry the same 'exp_grad_' name prefix;
        # kept unchanged for compatibility.
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty), name='exp_grad_%s' % param.name)
        exp_sqr_update[param] = theano.shared(value=as_floatX(empty), name='exp_grad_%s' % param.name)
        g_params.append(T.grad(cost, param))

    for param, gp in zip(params, g_params):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_update[param]

        # Running average of squared gradients.
        update_exp_sg = rho*exp_sg + (1 - rho)*T.sqr(gp)
        updates[exp_sg] = update_exp_sg

        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(update_exp_sg + epsilon)) * gp
        stepped_param = param + step*stepSize

        # Running average of squared (unscaled) steps.
        update_exp_su = rho*exp_su + (1 - rho)*T.sqr(step)
        updates[exp_su] = update_exp_su

        if param.get_value(borrow=True).ndim == 2 and param.name != word_vec_name:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param*scale
        else:
            updates[param] = stepped_param
    return updates
def get_regs(self, states_0_, states, M):
    """
    Additional regularization terms.

    Sums up to three penalties, each enabled by a positive coefficient on
    ``self``: mean absolute value of ``Wrec`` (L1_Wrec), mean square of
    ``Wrec`` (L2_Wrec), and a mask-weighted mean square of the firing rates
    ``self.f_hidden(states)`` including the initial state (L2_r).  Returns 0
    when none is enabled.
    """
    regs = 0

    # Recurrent weights
    if self.L1_Wrec > 0:
        regs += self.L1_Wrec * tensor.mean(abs(self.params['Wrec']))
    if self.L2_Wrec > 0:
        regs += self.L2_Wrec * tensor.mean(tensor.sqr(self.params['Wrec']))

    # Firing rates
    if self.L2_r > 0:
        baseline = 0.

        # Mask repeated along a new leading axis of size states.shape[-1],
        # then transposed.
        mask = (tensor.tile(M.T, (states.shape[-1], 1, 1))).T

        # Prepend the initial state as an extra leading slice.
        initial = states_0_.reshape((1, states_0_.shape[0], states_0_.shape[1]))
        states_all = tensor.concatenate([initial, states], axis=0)

        rates = self.f_hidden(states_all)
        masked_sq_sum = tensor.sum(tensor.sqr(rates - baseline)*mask)
        regs += self.L2_r * masked_sq_sum/tensor.sum(mask)

    return regs
def build_cost_functional_L2norm_w_reg(lambda_val, h, y_sym, Thetas):
    """Build the regularized squared-error cost J = J_y(Theta, b).

    The data term is half the mean of the element-wise squared difference
    between ``h`` and ``y_sym``.  The regularization term is ``lambda_val / 2``
    times the mean, over all entries of ``Thetas``, of the sum of squared
    elements of each Theta.

    INPUT/PARAMETERS
    ================
    @type lambda_val : number
    @param lambda_val : regularization strength

    @type h : theano variable
    @param h : hypothesis (model output)

    @type y_sym : theano symbolic matrix or theano shared variable
    @param y_sym : output data

    @type Thetas : tuple, list, or (ordered) iterable of theano variables
    @param Thetas : weight matrices of all the layers (the matrices
        themselves, not wrapper objects)

    RETURN/OUTPUTS
    ==============
    @type J_theta : theano symbolic expression (computational graph)
    """
    floatX = theano.config.floatX
    as_floatX_scalar = np.cast[floatX]

    # T.sqr is element-wise, so the data term averages over every entry.
    data_term = as_floatX_scalar(0.5) * T.mean(T.sqr(h - y_sym))

    per_layer_sq_sums = [T.sum(T.sqr(Theta), acc_dtype=floatX) for Theta in Thetas]
    reg_term = T.mean(per_layer_sq_sums, acc_dtype=floatX)
    reg_term = as_floatX_scalar(lambda_val / (2.)) * reg_term

    return data_term + reg_term
def cost(self):
    """
    Build the symbolic training cost selected by ``self.loss``.

    Handled values of ``self.loss``: 'ce' / 'priori' (cross entropy),
    'entropy', and 'sse'. Any other value fails the final assertion.

    :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
    :returns: cost, known_grads. known_grads is None except for sparse
      cross-entropy targets, where it maps self.z to its gradient.
    """
    known_grads = None
    if self.loss == 'ce' or self.loss == 'priori':
        if self.attrs.get("target", "").endswith("[sparse:coo]"):
            # Sparse targets: self.y is expected to be a 3-tuple that is
            # passed on to the native op together with a mask.
            assert isinstance(self.y, tuple)
            assert len(self.y) == 3
            from NativeOp import crossentropy_softmax_and_gradient_z_sparse
            y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
            ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
                self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
            # The op also returns the gradient w.r.t. z, handed back as known_grads.
            return self.norm * T.sum(ce), {self.z: grad_z}
        if self.y_data_flat.type == T.ivector().type:
            # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
            # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
            #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1) # faster than line below
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
            # NOTE(review): pcx is not used afterwards in this branch.
            nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
            #z_c = T.exp(self.z[:,self.y])
            #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
        else:
            # Targets are not an int vector: dot the clipped log-probabilities
            # with the targets instead.
            nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
        return self.norm * T.sum(nll), known_grads
    elif self.loss == 'entropy':
        h_e = T.exp(self.y_m) #(TB)
        # Softmax computed by hand, reshaped with n_out as last axis and clipped.
        pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0],self.index.shape[1],self.attrs['n_out'])), 1.e-6, 1.e6) # TBD
        # NOTE(review): T.sum has no axis here, so ee looks like a scalar even
        # though the return below sums it over axis=0 - confirm.
        ee = -T.sum(pcx[self.i] * T.log(pcx[self.i])) # TB
        #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
        nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat) # TB
        ce = nll.reshape(self.index.shape) * self.index # TB
        y = self.y_data_flat.reshape(self.index.shape) * self.index # TB
        # f flags the columns that contain at least one positive masked target.
        f = T.any(T.gt(y,0), axis=0) # B
        # Flagged columns use the cross entropy, the others the entropy term.
        return T.sum(f * T.sum(ce, axis=0) + (1-f) * T.sum(ee, axis=0)), known_grads
        #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
        #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
    elif self.loss == 'priori':
        # NOTE(review): unreachable - 'priori' is already caught by the first
        # condition above. Confirm which branch is the intended one.
        pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
        pcx = T.clip(pcx, 1.e-38, 1.e20) # For pcx near zero, the gradient will likely explode.
        return -T.sum(T.log(pcx)), known_grads
    elif self.loss == 'sse':
        if self.y_data_flat.dtype.startswith('int'):
            # Integer targets: build a one-hot encoding by comparing against
            # arange(n_out), then take the mean squared error.
            y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
            y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
            return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
        else:
            # Non-integer targets: summed (not averaged) squared error on y_m.
            #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
            return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
            #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
            #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
            #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
            #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
    else:
        assert False, "unknown loss: %s" % self.loss
def exe(self, mainloop):
    """
    Clip the norm of selected update expressions in ``mainloop.updates``.

    An entry is selected when the string form of its key contains one of
    ``self.keys`` and none of ``self.waivers``.  With ``self.is_vector`` the
    norms are taken along axis 0, otherwise over the whole tensor; they are
    clipped to ``self.weight_norm`` and the update is rescaled accordingly.
    The entry is rewritten once per matching key.
    """
    for k, _ in mainloop.updates.items():
        label = str(k)
        for key in self.keys:
            if key not in label:
                continue
            if any(waiver in label for waiver in self.waivers):
                continue

            updated_param = mainloop.updates[k]
            squared = T.sqr(updated_param)
            if self.is_vector:
                norms = T.sqrt(squared.sum(axis=0))
            else:
                norms = T.sqrt(squared.sum())

            clipped = T.clip(norms, 0, self.weight_norm)
            # 1e-7 avoids dividing by a zero norm.
            ratio = (clipped / (1e-7 + norms))
            mainloop.updates[k] = updated_param * ratio
def mcmc(ll, *frvs):
    """One leapfrog proposal followed by an accept/reject decision.

    Relies on names from the enclosing scope: ``observations``,
    ``free_RVs``, ``free_RVs_prop`` (momenta), ``epsilon`` (step size),
    ``U`` (compared against the acceptance probability) and
    ``full_log_likelihood``.

    Parameters
    ----------
    ll : theano variable
        Current log-likelihood value carried between iterations.
    *frvs : theano variables
        Current values of the free random variables, in the order of
        ``free_RVs``.

    Returns
    -------
    tuple
        ``(outputs, updates, condition)``: the new log-likelihood followed
        by one value per free variable, an empty updates dict, and a
        ``theano.scan_module.until`` condition on acceptance (presumably
        consumed as a ``theano.scan`` step result - confirm with caller).
    """
    full_observations = dict(observations)
    full_observations.update(zip(free_RVs, frvs))
    # Despite the name this is the NEGATIVE log-likelihood (potential energy).
    loglik = -full_log_likelihood(full_observations)

    proposals = free_RVs_prop
    H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

    # -- this should be an inner loop
    # One gradient per free variable, so every momentum gets its half step.
    g = tensor.grad(loglik, list(frvs))
    proposals = [(p - epsilon*gg/2.) for p, gg in zip(proposals, g)]

    rvsp = [(rvs + epsilon*rvp) for rvs, rvp in zip(frvs, proposals)]

    full_observations = dict(observations)
    full_observations.update(zip(free_RVs, rvsp))
    new_loglik = -full_log_likelihood(full_observations)

    gnew = tensor.grad(new_loglik, rvsp)
    proposals = [(p - epsilon*gn/2.) for p, gn in zip(proposals, gnew)]
    # --

    Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

    dH = Hnew - H
    accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

    return [tensor.switch(accept, -new_loglik, ll)] + \
        [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
        {}, theano.scan_module.until(accept)
def free_energy(self, V):
    """
    Symbolic free energy of each row of the 2-D input ``V``.

    Computes ``(0.5 * sum(V^2) - V . bias_vis) / sigma^2`` minus the sum over
    hidden units of ``softplus((lmul(V) + bias_hid) / sigma^2)``.  The
    intermediate terms are named and asserted to be 1-D.
    """
    assert V.ndim == 2
    sigma_sq = T.sqr(self.sigma)

    bias_term = T.dot(V, self.bias_vis)
    bias_term.name = 'bias_term'
    assert len(bias_term.type.broadcastable) == 1

    sq_term = 0.5 * T.sqr(V).sum(axis=1)
    sq_term.name = 'sq_term'
    assert len(sq_term.type.broadcastable) == 1

    hidden_input = (self.transformer.lmul(V) + self.bias_hid) / sigma_sq
    softplus_term = T.nnet.softplus(hidden_input).sum(axis=1)
    assert len(softplus_term.type.broadcastable) == 1
    softplus_term.name = 'softplus_term'

    visible_term = (sq_term - bias_term) / T.sqr(self.sigma)
    return visible_term - softplus_term
def learning_updates(self):
    """Yield Adadelta-style (shared variable, new value) pairs.

    For each parameter two zero-initialised accumulators are created, named
    after the parameter and ``self.network.R``.  Three pairs are yielded per
    parameter: the squared-gradient average, the squared-step average and
    the parameter itself.  ``self.lr`` is used as the constant under the
    square roots, and the parameter moves by twice the computed step.
    """
    grads = self.grads
    for idx, param in enumerate(self.params):
        dtype = theano.config.floatX
        tag = str(self.network.R)
        grad = grads[idx]

        mean_square_grad = theano.shared(
            np.zeros_like(param.get_value(), dtype=dtype),
            name=param.name + tag + '_msg')
        mean_square_dx = theano.shared(
            np.zeros_like(param.get_value(), dtype=dtype),
            name=param.name + tag + '_dx')

        # Running average of squared gradients.
        decayed_sq_grad = self.decay * mean_square_grad
        new_mean_squared_grad = decayed_sq_grad + (1 - self.decay) * T.sqr(grad)

        # Step scaled by the ratio of the two RMS values.
        epsilon = self.lr
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = - (rms_dx_tm1 / rms_grad_t) * grad

        # Running average of squared steps.
        decayed_sq_dx = self.decay * mean_square_dx
        new_mean_square_dx = decayed_sq_dx + (1 - self.decay) * T.sqr(delta_x_t)

        yield mean_square_grad, T.cast(new_mean_squared_grad, dtype=dtype)
        yield mean_square_dx, T.cast(new_mean_square_dx, dtype=dtype)
        yield param, param + 2*T.cast(delta_x_t, dtype=dtype)
def _step(self, x_tm1, u_tm1, inputs, x_prior, u_prior, *args):
    """One inference step: update the states ``x`` and then the causes ``u``.

    ``x_prior`` are the previous states and ``u_prior`` the causes from
    above.  Returns ``(x, u, u_cost, outputs)`` where ``outputs`` is the
    reconstruction from ``x_tm1`` and ``u_cost`` is evaluated with the
    updated causes.
    """
    def cause_gain(u_state):
        # Gain applied to the sparsity weight, derived from the causes.
        return (1 + T.exp(-T.dot(u_state, self.V))) * .5

    outputs = self.activation(T.dot(x_tm1, self.W))
    rec_error = T.sqr(inputs - outputs).sum()

    causes = cause_gain(u_tm1)
    causes_up = causes
    if self.pool_flag:
        # Treat each row as a square map and repeat it along both spatial
        # axes by the pooling size.
        batch_size = inputs.shape[0]
        side = T.cast(T.sqrt(causes.shape[1]), 'int64')
        causes_map = causes.reshape((batch_size, 1, side, side))
        causes_map = causes_map.repeat(self.pool_size, axis=2)
        causes_map = causes_map.repeat(self.pool_size, axis=3)
        causes_up = causes_map.flatten(ndim=2)

    x = _IstaStep(rec_error, x_tm1, lambdav=self.gamma*causes_up,
                  x_prior=x_prior)

    x_pool = x
    if self.pool_flag:
        side = T.cast(T.sqrt(x.shape[1]), 'int64')
        x_map = x.reshape((batch_size, 1, side, side))
        x_pool = max_pool_2d(x_map, ds=(self.pool_size, )*2).flatten(ndim=2)

    prev_u_cost = .01 * self.gamma * T.sqr(u_tm1-u_prior).sum()
    u_cost = causes * abs(x_pool) * self.gamma + prev_u_cost
    u = _IstaStep(u_cost.sum(), u_tm1, lambdav=self.gamma)

    # Re-evaluate the cost with the updated causes (without the prior term).
    causes = cause_gain(u)
    u_cost = causes * abs(x_pool) * self.gamma

    return (x, u, u_cost, outputs)
def __call__(self, model, X, Y):
    """Cost comparing log-ratios of target and predicted axis distributions.

    ``Y`` and the model output are split at column ``image_size`` into an
    x part and a y part.  For each part, every value is divided by the value
    at the target's argmax position, the log is taken, and the squared
    difference between target and prediction is accumulated.  Rows whose
    target sums to zero are masked out.  The result is the mean over rows
    of the per-row sum.

    The row index is built from ``Y.shape[0]``, so any minibatch size works.

    Parameters
    ----------
    model : object with an ``fprop`` method
    X : theano variable, model input
    Y : theano matrix, targets

    Returns
    -------
    theano scalar
    """
    image_size = 96
    Y_hat = model.fprop(X)
    print("Warning: the size of the axe is set manually")

    Yx_hat = Y_hat[:, :image_size]
    Yy_hat = Y_hat[:, image_size:]
    Yx = Y[:, :image_size]
    Yy = Y[:, image_size:]

    epsylon = 1e-10
    rows = T.arange(Y.shape[0])

    max_x = T.argmax(Yx, axis=1)
    max_y = T.argmax(Yy, axis=1)

    def log_ratio(values, peak):
        # log of each value relative to the value at the target's peak.
        return T.log((values + epsylon) / (values[rows, peak] + epsylon)[:, None])

    costMatrix = T.sqr(log_ratio(Yx, max_x) - log_ratio(Yx_hat, max_x))
    costMatrix += T.sqr(log_ratio(Yy, max_y) - log_ratio(Yy_hat, max_y))

    # Ignore examples whose target row sums to zero.
    costMatrix *= T.neq(T.sum(Y, axis=1), 0)[:, None]

    cost = costMatrix.sum(axis=1).mean()
    return cost
def get_layer_monitoring_channels(self, state_below=None, state=None, target=None):
    """Return an OrderedDict of monitoring channels for this layer.

    'norm' (root of the sum of squared weights) is always present.  When a
    target and either ``state`` or ``state_below`` are given, 'rmse',
    '01_loss', 'neg_precision', 'neg_recall' and 'neg_f1' are added.  The
    target is flipped (1 - target) before comparison, and predictions are
    obtained by thresholding the state.
    """
    rval = OrderedDict()
    W, = self.transformer.get_params()
    rval['norm'] = T.sqrt(T.sqr(W).sum())

    have_state = (state_below is not None) or (state is not None)
    if target is None or not have_state:
        return rval

    if state is None:
        state = self.fprop(state_below)

    # 0/1 dissim/sim to 1/0 distances
    target = 1. - target

    rmse = T.sqrt(T.mean(T.sqr(state - target)))
    rval['rmse'] = rmse.mean()

    # Decision threshold depends on the cost function in use.
    if self.costfn == 'margin':
        thresh = self.costparam
    elif self.costfn == 'cauchy':
        thresh = 2. / (1. + T.exp(self.costparam))
    else:
        thresh = 0.5

    yhat = state < thresh
    y = target < 0.5
    wrong_bit = T.cast(T.neq(y, yhat), state.dtype)
    rval['01_loss'] = wrong_bit.mean()

    y = T.cast(y, state.dtype)
    yhat = T.cast(yhat, state.dtype)
    tp = (y * yhat).sum()
    fp = ((1 - y) * yhat).sum()

    prec = compute_precision(tp, fp)
    rec = compute_recall(y, tp)
    f1 = compute_f1(prec, rec)

    # Stored negated.
    rval['neg_precision'] = -prec
    rval['neg_recall'] = -rec
    rval['neg_f1'] = -f1
    return rval
def get_updates(self, grads):
    """Build Adadelta updates from a parameter -> gradient mapping.

    Two zero-initialised shared accumulators are created per parameter and
    appended to ``self.parameters``.  The returned OrderedDict holds updates
    for both accumulators and for the parameter itself.
    """
    grads = OrderedDict(grads)
    updates = OrderedDict()

    for param in grads.keys():
        grad = grads[param]

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = theano.shared(
            theano._asarray(param.get_value() * 0., dtype=theano.config.floatX),
            name='mean_square_grad_' + param.name,
            borrow=False)
        self.parameters.append(mean_square_grad)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = theano.shared(
            theano._asarray(param.get_value() * 0., dtype=theano.config.floatX),
            name='mean_square_dx_' + param.name,
            borrow=False)
        self.parameters.append(mean_square_dx)

        # Running average of squared gradients.
        new_mean_squared_grad = (self.decay * mean_square_grad
                                 + (1 - self.decay) * T.sqr(grad))

        # Step scaled by the ratio of the two RMS values.
        rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
        delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

        # Running average of squared steps.
        new_mean_square_dx = (self.decay * mean_square_dx
                              + (1 - self.decay) * T.sqr(delta_x_t))

        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t

    return updates
def entropy_exp(X, g=None, b=None, u=None, s=None, a=1., e=1e-8):
    """Batch-normalise ``X`` and optionally apply an exponential gain.

    Parameters
    ----------
    X : theano variable with ndim 4 (normalised over axes 0, 2, 3) or
        ndim 2 (normalised over axis 0).
    g, b : theano variables or None
        When both are given, the normalised value is multiplied by
        ``exp(g)`` and shifted by ``b``.
    u, s : theano variables or None
        Mean and variance to use instead of batch statistics.  Batch
        statistics are computed unless both are given.
    a : float
        When different from 1, the mean is scaled by ``a`` and the variance
        is interpolated towards 1 (``(1 - a) + a * s``).
    e : float
        Constant added to the variance before the square root.

    Raises
    ------
    NotImplementedError
        If ``X.ndim`` is neither 2 nor 4.
    """
    if X.ndim == 4:
        if u is not None and s is not None:
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        if a != 1:
            b_u = (1. - a)*0. + a*b_u
            b_s = (1. - a)*1. + a*b_s
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            X = X*T.exp(g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        # Same rule as the 4-D branch: fall back to batch statistics unless
        # both u and s were supplied.
        if u is None or s is None:
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        if a != 1:
            u = (1. - a)*0. + a*u
            s = (1. - a)*1. + a*s
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            X = X*T.exp(g) + b
    else:
        raise NotImplementedError
    return X
def batchnorm(X, g=None, b=None, u=None, s=None, a=1., e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters
    as well as inference values (u and s) and partial batchnorm (via a)
    will detect and use convolutional or fully connected version

    Parameters
    ----------
    X : theano variable with ndim 4 (normalised over axes 0, 2, 3) or
        ndim 2 (normalised over axis 0).
    g, b : theano variables or None
        Scale and shift, applied only when both are given.
    u, s : theano variables or None
        Mean and variance to use instead of batch statistics.  Batch
        statistics are computed unless both are given.
    a : float
        When different from 1, the mean is scaled by ``a`` and the variance
        is interpolated towards 1 (``(1 - a) + a * s``).
    e : float
        Constant added to the variance before the square root.

    Raises
    ------
    NotImplementedError
        If ``X.ndim`` is neither 2 nor 4.
    """
    if X.ndim == 4:
        if u is not None and s is not None:
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            b_u = tensor.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = tensor.mean(tensor.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        if a != 1:
            b_u = (1. - a)*0. + a*b_u
            b_s = (1. - a)*1. + a*b_s
        X = (X - b_u) / tensor.sqrt(b_s + e)
        if g is not None and b is not None:
            X = X*g.dimshuffle('x', 0, 'x', 'x') + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        # Same rule as the 4-D branch: fall back to batch statistics unless
        # both u and s were supplied.
        if u is None or s is None:
            u = tensor.mean(X, axis=0)
            s = tensor.mean(tensor.sqr(X - u), axis=0)
        if a != 1:
            u = (1. - a)*0. + a*u
            s = (1. - a)*1. + a*s
        X = (X - u) / tensor.sqrt(s + e)
        if g is not None and b is not None:
            X = X*g + b
    else:
        raise NotImplementedError
    return X
def create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps, beta1, beta2):
    """Fill ``updates`` in place with Adam update expressions.

    ``gsums`` and ``xsums`` hold the first- and second-moment accumulators
    for the matching entry of ``params``.  A parameter that
    ``is_subtensor_op`` recognises is updated only at the indexed entries:
    the accumulators are read and written at the same indexes and the
    update is registered on the underlying variable.  A shared step counter
    is created and incremented through ``updates`` as well.
    """
    step = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    next_step = step + 1.0

    # Bias-corrected learning rate.
    omb1_t = 1.0 - beta1**next_step
    omb2_t = 1.0 - beta2**next_step
    lr_t = lr * (T.sqrt(omb2_t) / omb1_t)

    for p, g, m, v in zip(params, gparams, gsums, xsums):
        partial = is_subtensor_op(p)
        if partial:
            origin, indexes = get_subtensor_op_inputs(p)
            m_prev = m[indexes]
            v_prev = v[indexes]
        else:
            m_prev = m
            v_prev = v

        m_t = beta1*m_prev + (1.0-beta1)*g
        v_t = beta2*v_prev + (1.0-beta2)*T.sqr(g)
        g_t = m_t / (T.sqrt(v_t) + eps)

        if partial:
            updates[m] = T.set_subtensor(m_prev, m_t)
            updates[v] = T.set_subtensor(v_prev, v_t)
            updates[origin] = T.inc_subtensor(p, -lr_t*g_t)
        else:
            updates[m] = m_t
            updates[v] = v_t
            updates[p] = p - lr_t*g_t

    updates[step] = next_step
def cosine_similarity(y_true, y_pred):
    """Per-example cosine distance ``1 - cos(y_true, y_pred)``.

    Both arguments are expected to be matrices with one example per row
    (norms are taken along axis 1).  Each row of ``y_true`` is compared with
    the row of ``y_pred`` at the same position.

    Returns
    -------
    theano vector
        One value per row.
    """
    norm_y_true = T.sqrt(T.sum(T.sqr(y_true), 1, keepdims=True))
    norm_y_pred = T.sqrt(T.sum(T.sqr(y_pred), 1, keepdims=True))

    # Row-wise dot product, kept as a column so it lines up with the norms.
    # (A tensordot over axis 1 would instead give every cross-example pair.)
    dot = T.sum(y_true * y_pred, axis=1, keepdims=True)

    cossim = dot / (norm_y_true * norm_y_pred)
    objective = 1 - cossim
    return objective.mean(axis=-1)
def mse(output, target, mean_over_second=True):
    """
    Mean Square Error (MSE) between the output and the target.

    Parameters
    ----------
    output : tensor
        The symbolic tensor (or compatible) output from the network. (Comes from model).
    target : tensor
        The symbolic tensor (or compatible) target truth to compare the output against. (Comes from data).
    mean_over_second : bool
        True: average the squared error over every element.
        False: sum the squared error over axis 1 first, then average.

    Returns
    -------
    number
        The appropriate mean square error.
    """
    # The following definition came from the Conditional_nade project
    squared_error = T.sqr(target - output)
    if not mean_over_second:
        squared_error = squared_error.sum(axis=1)
    return T.mean(squared_error)
def initialise(self):
    """Batch-normalise ``self.X`` in place and register scale/shift params.

    ``self.X`` with ndim 4 is normalised over axes 0, 2, 3; with ndim 2 over
    axis 0.  ``self.u`` / ``self.s`` are used as mean / variance when both
    are set, otherwise batch statistics are computed (and, in the 2-D case,
    stored back on ``self``).  When ``self.a`` differs from 1 the mean is
    scaled by ``self.a`` and the variance interpolated towards 1.  When both
    ``self.g`` and ``self.b`` are set they are applied as scale and shift
    and appended to ``self.params``.

    Raises
    ------
    NotImplementedError
        If ``self.X.ndim`` is neither 2 nor 4.
    """
    if self.X.ndim == 4:
        if self.u is not None and self.s is not None:
            b_u = self.u.dimshuffle('x', 0, 'x', 'x')
            b_s = self.s.dimshuffle('x', 0, 'x', 'x')
        else:
            b_u = T.mean(self.X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(self.X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        if self.a != 1:
            b_u = (1. - self.a)*0. + self.a*b_u
            b_s = (1. - self.a)*1. + self.a*b_s
        # Store the normalised value so the scale/shift below (and any
        # consumer of self.X) works on normalised activations.
        self.X = (self.X - b_u) / T.sqrt(b_s + self.e)
        if self.g is not None and self.b is not None:
            self.X = self.X*self.g.dimshuffle('x', 0, 'x', 'x') + self.b.dimshuffle('x', 0, 'x', 'x')
            self.params.append(self.g)
            self.params.append(self.b)
    elif self.X.ndim == 2:
        # Same rule as the 4-D branch: fall back to batch statistics unless
        # both u and s were supplied.
        if self.u is None or self.s is None:
            self.u = T.mean(self.X, axis=0)
            self.s = T.mean(T.sqr(self.X - self.u), axis=0)
        if self.a != 1:
            self.u = (1. - self.a)*0. + self.a*self.u
            self.s = (1. - self.a)*1. + self.a*self.s
        self.X = (self.X - self.u) / T.sqrt(self.s + self.e)
        if self.g is not None and self.b is not None:
            self.X = self.X*self.g + self.b
            self.params.append(self.g)
            self.params.append(self.b)
    else:
        raise NotImplementedError
def __init__(self, mu=0.5, learning_rate=0.1, n_epochs=40,
             dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500,
             lam_contractive=0, lam_l2=0.01, temperature=1):
    """ Demonstrates lenet on MNIST dataset

    Loads the dataset, builds a two-conv-layer / one-hidden-layer / softmax
    network and compiles the Theano functions stored on the instance
    (``validate_p``, ``test_model``, ``validate_model``, ``train_model``,
    ``test_confidencefunc``).

    :type mu: float
    :param mu: momentum coefficient used in the parameter/velocity updates

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (the default list is
                   only read here, never mutated)

    :type batch_size: int
    :param batch_size: minibatch size; also fixes the symbolic reshape of
                       the input, so every compiled function expects
                       exactly this many rows

    :param lam_contractive: not referenced in this constructor
    :param lam_l2: not referenced in this constructor
    :param temperature: forwarded to ``FCSoftMaxLayer``
    """
    self.mu = mu
    self.learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.train_batch_size = batch_size
    self.datasets = load_data(dataset)
    self.train_set_x, self.train_set_y = self.datasets[0]
    self.valid_set_x, self.valid_set_y = self.datasets[1]
    self.test_set_x, self.test_set_y = self.datasets[2]

    # compute number of minibatches for train, valid, test
    # (floor division: a trailing partial batch is not counted)
    self.n_train_batches = self.train_set_x.get_value(borrow=True).shape[0]
    self.n_train_batches //= batch_size
    self.n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0]
    self.n_valid_batches //= batch_size
    self.n_test_batches = self.test_set_x.get_value(borrow=True).shape[0]
    self.n_test_batches //= batch_size

    # allocate symbolic variables for the data
    self.index = T.lscalar()  # index to a minibatch

    # start-snippet-1
    x = T.matrix('x')
    y = T.ivector('y')

    # BUILD ACTUAL MODEL
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor
    layer0_input = x.reshape((self.train_batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # Filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    self.rng = numpy.random.RandomState(23455)
    self.layer0 = LeNetConvPoolLayer(
        self.rng,
        input=layer0_input,
        image_shape=(self.train_batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2))
    layer1_input = self.layer0.output
    # NOTE(review): layer1_input_flatten is not used again in this method.
    layer1_input_flatten = self.layer0.output.flatten(2)

    # Construct the second convolutional pooling layer
    # Filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    self.layer1 = LeNetConvPoolLayer(
        self.rng,
        input=layer1_input,
        image_shape=(self.train_batch_size, nkerns[0], 12, 12), \
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2))

    # The FC layer. It operates on 2D matrices of shape
    # (batch_size, volume_depth * num_pixels). This will generate a matrix
    # of shape (batch_size, nkerns[1] * 4 * 4).
    # NOTE(review): open question left by the author -- the hidden layer
    # width (500) happens to equal the default minibatch size.
    layer2_input = self.layer1.output.flatten(2)
    self.layer2 = FCLayer(self.rng,
                          input=layer2_input,
                          n_in=nkerns[1] * 4 * 4,
                          n_out=500,
                          activation=T.tanh)

    # classify the values of the fully-connected (tanh) layer
    layer3_input = self.layer2.output
    self.layer3 = FCSoftMaxLayer(input=layer3_input,
                                 n_in=500,
                                 n_out=10,
                                 rng=self.rng,
                                 temperature=temperature)

    # Parameters, their shapes and their velocities are all concatenated in
    # the same layer order (3, 2, 1, 0) so that they can be zipped below.
    self.params = self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params
    L2params = [self.layer3.W, self.layer2.W, self.layer1.W, self.layer0.W]
    self.params_shape = self.layer3.params_shape + self.layer2.params_shape + self.layer1.params_shape + self.layer0.params_shape
    velocities = self.layer3.velocity + self.layer2.velocity + self.layer1.velocity + self.layer0.velocity

    # Sum of squared weights over all layers.
    # NOTE(review): paramssum is built but never added to `cost` here.
    paramssum = T.sum(T.sqr(L2params[0]))
    for i in range(1, len(L2params)):
        paramssum += T.sum(T.sqr(L2params[i]))

    y_score_given_x = self.layer3.p_y_given_x

    # layer3 to x contractive: for each example, the gradient of the
    # probability of its labelled class with respect to its own input row.
    # NOTE(review): layer3_Jnorm1 is not used in any compiled function below.
    layer3_Jnorm1, _ = theano.scan(
        lambda ind, yi, y_score_given_x, x:
            (theano.gradient.jacobian(y_score_given_x[ind, yi], x))[ind, :],
        sequences=[T.arange(self.train_batch_size), y],
        non_sequences=[y_score_given_x, x])

    cost = self.layer3.negative_log_likelihood(y)

    # Frobenius norm of the Jacobian of the first example's class scores
    # with respect to that example's layer-3 input, and its gradients.
    testnorm = T.sum(
        (theano.gradient.jacobian(y_score_given_x[0], layer3_input)[:, 0, :])**2)**0.5
    testgrads = T.grad(T.sum(testnorm), self.params)

    grads = T.grad(cost, self.params)

    # momentum update: the new parameter equals the old parameter plus the
    # new velocity (mu * v - learning_rate * grad).  learning_rate and mu
    # are baked into the graph as the Python values passed to __init__.
    updates = [
        (param_i, param_i - learning_rate * grad_i + mu * v_i)
        for param_i, grad_i, v_i in zip(self.params, grads, velocities)
    ]
    updates += [(v_i, mu * v_i - learning_rate * grad_i)
                for grad_i, v_i in zip(grads, velocities)]

    # Returns testnorm followed by its gradients for one validation
    # minibatch (only x is substituted; the labels are not needed).
    self.validate_p = theano.function(
        [self.index],
        [testnorm] + testgrads,
        givens={
            x: self.valid_set_x[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size]
        })

    # Functions computing the mistakes made by the model on one minibatch
    # of the test / validation set.
    self.test_model = theano.function(
        [self.index],
        self.layer3.errors(y),
        givens={
            x: self.test_set_x[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size],
            y: self.test_set_y[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size]
        })
    self.validate_model = theano.function(
        [self.index],
        self.layer3.errors(y),
        givens={
            x: self.valid_set_x[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size],
            y: self.valid_set_y[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size]
        })

    # One momentum-SGD step on a training minibatch; returns the cost.
    self.train_model = theano.function(
        [self.index],
        cost,
        updates=updates,
        givens={
            x: self.train_set_x[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size],
            y: self.train_set_y[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size]
        })

    # Output of layer3.confidence_mean(y) on one test minibatch.
    self.test_confidencefunc = theano.function(
        [self.index],
        self.layer3.confidence_mean(y),
        givens={
            x: self.test_set_x[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size],
            y: self.test_set_y[self.index * self.train_batch_size:(self.index + 1) * self.train_batch_size]
        })
def __init__(self, numpy_rng=numpy.random.RandomState(2**30), theano_rng=None, n_ins=601, n_outs=259, l1_reg=None, l2_reg=None, hidden_layers_sizes=[512, 512, 512, 512, 512, 512, 512], n_speakers_accent=2, hidden_activation='tanh', output_activation='linear'):
    """Build the feed-forward network and its regularised training cost.

    A stack of tanh ``HiddenLayer`` objects (one per entry of
    `hidden_layers_sizes`) is followed by a single output layer chosen by
    `output_activation` ('linear' or 'sigmoid'; anything else prints a
    message and exits the process).  ``self.error_cost`` is the mean over
    rows of the summed squared output error, plus optional L1 / L2
    penalties on the hidden-layer weight matrices.

    `n_speakers_accent`, `hidden_activation` and the `theano_rng` stream
    are accepted but not used when building the layers here.
    """
    print("DNN MULTI-SPEAKER INITIALISATION")

    # Containers and bookkeeping attributes.
    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []

    self.n_layers = len(hidden_layers_sizes)
    self.n_ins = n_ins
    self.n_outs = n_outs
    self.output_activation = output_activation
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    self.final_layer_accent = []
    self.error_cost = []
    self.errors_accent = []

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy.random.randint(2**30))

    # Symbolic inputs and targets.
    self.x = T.matrix('x')
    self.y = T.matrix('y')

    # Hidden stack: the first layer reads the input, every later layer
    # reads the output of the layer before it.
    for depth in xrange(self.n_layers):
        if depth == 0:
            fan_in, source = n_ins, self.x
        else:
            fan_in = hidden_layers_sizes[depth - 1]
            source = self.sigmoid_layers[-1].output

        hidden = HiddenLayer(rng=numpy_rng,
                             input=source,
                             n_in=fan_in,
                             n_out=hidden_layers_sizes[depth],
                             activation=T.tanh)
        self.sigmoid_layers.append(hidden)
        self.params.extend(hidden.params)
        self.delta_params.extend(hidden.delta_params)

    # Output layer.
    top_hidden = self.sigmoid_layers[-1].output
    if self.output_activation == 'linear':
        self.final_layer_accent = LinearLayer(rng=numpy_rng,
                                              input=top_hidden,
                                              n_in=hidden_layers_sizes[-1],
                                              n_out=n_outs)
    elif self.output_activation == 'sigmoid':
        self.final_layer_accent = SigmoidLayer(rng=numpy_rng,
                                               input=top_hidden,
                                               n_in=hidden_layers_sizes[-1],
                                               n_out=n_outs,
                                               activation=T.nnet.sigmoid)
    else:
        print("This output activation function: %s is not supported right now!" % (self.output_activation))
        sys.exit(1)

    self.params.extend(self.final_layer_accent.params)
    self.delta_params.extend(self.final_layer_accent.delta_params)

    # Mean over rows of the summed squared error.
    residual = self.final_layer_accent.output - self.y
    total_cost = T.mean(T.sum(residual * residual, axis=1))

    # Every layer contributes two consecutive entries to self.params, so
    # index 2 * k is read as the weight matrix of hidden layer k.
    if self.l1_reg is not None:
        for k in xrange(self.n_layers):
            total_cost = total_cost + self.l1_reg * (abs(self.params[k * 2]).sum())

    if self.l2_reg is not None:
        for k in xrange(self.n_layers):
            total_cost = total_cost + self.l2_reg * T.sqr(self.params[k * 2]).sum()

    self.error_cost = total_cost
def _setup_functions(self, trX):
    """Create the weights (once) and compile the Theano functions.

    Parameters
    ----------
    trX : array
        Training data; only its ``shape`` is read.  ``shape[1]`` is used as
        the number of input channels and ``shape[-2]`` / ``shape[-1]`` as
        the spatial size.

    Side effects
    ------------
    Sets ``self.downpool_sz_h`` / ``self.downpool_sz_w``.  If the instance
    has no ``params`` attribute yet, also creates ``self.enc_params``,
    ``self.dec_params`` and ``self.params``.  Always compiles
    ``self._fit_function``, ``self._reconstruct``, ``self._x_given_z``,
    ``self._z_given_x`` and ``self._2x_given_2z``.
    """
    # Encoder filter shapes, and the decoder shapes obtained by swapping
    # the first two axes.
    l1_e = (10, trX.shape[1], 5, 5)
    print("l1_e", l1_e)
    l1_d = (l1_e[1], l1_e[0], l1_e[2], l1_e[3])
    print("l1_d", l1_d)
    l2_e = (20, l1_e[0], 5, 5)
    print("l2_e", l2_e)
    l2_d = (l2_e[1], l2_e[0], l2_e[2], l2_e[3])
    print("l2_d", l2_d)
    # 2 layers means downsample by 2 ** 2 -> 4, with input size 28x28 -> 7x7
    # assume square
    # Floor division keeps these sizes integral even when `/` is true
    # division; they feed directly into the weight shapes below.
    self.downpool_sz_h = trX.shape[-2] // 4
    self.downpool_sz_w = trX.shape[-1] // 4
    # Alternative that rounds up instead of down:
    # self.downpool_sz_h = int(np.ceil(trX.shape[-2] / 4.))
    # self.downpool_sz_w = int(np.ceil(trX.shape[-1] / 4.))
    l3_e = (l2_e[0] * self.downpool_sz_h * self.downpool_sz_w,
            self.n_hidden)
    print("l3_e", l3_e)
    l3_d = (l3_e[1], l3_e[0])
    print("l3_d", l3_d)
    sys.stdout.flush()

    if not hasattr(self, "params"):
        print('generating weights')
        sys.stdout.flush()
        we = uniform(l1_e)
        w2e = uniform(l2_e)
        w3e = uniform(l3_e)
        b3e = shared0s(self.n_hidden)
        wmu = uniform((self.n_hidden, self.n_code))
        bmu = shared0s(self.n_code)
        wsigma = uniform((self.n_hidden, self.n_code))
        bsigma = shared0s(self.n_code)

        wd = uniform((self.n_code, self.n_hidden))
        bd = shared0s((self.n_hidden))
        w2d = uniform(l3_d)
        b2d = shared0s((l3_d[1]))
        w3d = uniform(l2_d)
        wo = uniform(l1_d)
        self.enc_params = [we, w2e, w3e, b3e, wmu, bmu, wsigma, bsigma]
        self.dec_params = [wd, bd, w2d, b2d, w3d, wo]
        self.params = self.enc_params + self.dec_params

    print('theano code')
    sys.stdout.flush()

    X = T.tensor4()
    # Second input of self._model -- presumably the sampling noise;
    # TODO confirm against _model.
    e = T.matrix()
    Z_in = T.matrix()
    Z_in_1 = T.matrix()
    Z_in_2 = T.matrix()

    # encode_mu, encode_sigm = self._conv_gaussian_enc(X, *self.enc_params) #EHA
    # h2 = self._get_h2(X, *self.enc_params)
    code_mu, code_log_sigma, Z, y = self._model(X, e)

    # out_h = self._get_deconv_dec(Z_in, *self.dec_params)
    y_out = self._deconv_dec(Z_in, *self.dec_params)
    y_out_1 = self._deconv_dec(Z_in_1, *self.dec_params)
    y_out_2 = self._deconv_dec(Z_in_2, *self.dec_params)

    # Summed squared reconstruction error (not divided by the batch size).
    # rec_cost = T.sum(T.abs_(X - y))
    rec_cost = T.sum(T.sqr(X - y))  # / T.cast(X.shape[0], 'float32')
    prior_cost = log_prior(code_mu, code_log_sigma)

    cost = rec_cost - prior_cost

    print('getting updates')
    sys.stdout.flush()

    updates = Adam(self.params, cost)

    print('compiling')
    sys.stdout.flush()
    # self._encode = theano.function([X], (encode_mu, encode_sigm)) #EHA
    # self._hidden2 = theano.function([X], h2)
    # self._get_out_h = theano.function([Z_in], out_h)
    self._fit_function = theano.function([X, e], cost, updates=updates)
    self._reconstruct = theano.function([X, e], y)
    self._x_given_z = theano.function([Z_in], y_out)
    self._z_given_x = theano.function([X], (code_mu, code_log_sigma))
    self._2x_given_2z = theano.function([Z_in_1, Z_in_2],
                                        (y_out_1, y_out_2))
def mlp_synthetic(X_train, X_test, y_train, y_test, precision, vy, hWidths,
                  mini_batchsize=10, epochs=1000, display=False):
    """Train an MLP with SGD and report train/test error.

    All network weights live in a single shared row vector whose length is
    given by ``find_dim_theta``.  The training cost is
    ``0.5 * vy * sum((op - Y) ** 2) + 0.5 * precision * sum(params ** 2)``.

    Parameters
    ----------
    X_train, X_test : arrays
        Inputs; ``X_train.shape[1]`` is taken as the input size.
    y_train, y_test : arrays
        Targets; ``y_train.shape[1]`` is taken as the output size.
    precision : number
        Weight on the squared-parameter penalty.
    vy : number
        Weight on the squared-error term.
    hWidths :
        Forwarded to ``find_dim_theta`` and ``model``.
    mini_batchsize : int
        Rows per SGD step.
    epochs : int
        Number of passes over the training data.
    display : bool
        When true, print the mean-prediction baseline and add the cost
        curves to the current matplotlib figure.

    Returns
    -------
    tuple
        ``(fin_cost_train, fin_cost_test, final_params)``.
    """
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    X = T.fmatrix(name='X')
    Y = T.fmatrix(name='Y')
    # Fixed seed, so the initial parameters are the same on every call.
    rng = numpy.random.RandomState(123)
    dim = find_dim_theta(hWidths, input_size, output_size)
    input_size = X_train.shape[1]
    initial_params = theano.shared(
        floatX(rng.randn(1, dim).astype(theano.config.floatX)))
    params = initial_params
    op = model(X, params, hWidths, input_size, output_size)
    cost = T.sum(T.sqr(op - Y)) * (vy * 0.5) + T.sum(
        T.sqr(params)) * (precision * 0.5)
    updates = sgd(cost, params, lr=0.000001)
    # updates=Adam(cost,params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates,
                            allow_input_downcast=True,
                            name='train')
    predict = theano.function(inputs=[X],
                              outputs=op,
                              allow_input_downcast=True)
    # NOTE(review): fcost is only referenced from commented-out code below.
    fcost = theano.function(inputs=[op, Y],
                            outputs=cost,
                            allow_input_downcast=True)
    test_costs = []
    train_costs = []
    for i in range(epochs):
        # NOTE(review): the `end` range stops before len(X_train), so zip
        # yields one pair fewer than there are start offsets and the tail
        # of the training set is never used for an update.
        for start, end in zip(
                range(0, len(X_train), mini_batchsize),
                range(mini_batchsize, len(X_train), mini_batchsize)):
            # The reshape to (mini_batchsize, 1) only works for a single
            # target column.
            yd = (floatX(y_train[start:end])).reshape(mini_batchsize, 1)
            cost_v = train(X_train[start:end], yd)
        # Done this cost prediction needs to change
        # fin_cost_test = fcost(predict(X_test), floatX(y_test).reshape(len(y_test), 1))
        # fin_cost_train = fcost(predict(X_train), floatX(y_train).reshape(len(y_train), 1))
        # One error value per epoch; plotted against range(epochs) below.
        fin_cost_test = MSE(predict(X_test), y_test)
        fin_cost_train = MSE(predict(X_train), y_train)
        test_costs.append(fin_cost_test)
        train_costs.append(fin_cost_train)
        # print i, fin_cost_test, fin_cost_train
    final_params = params.get_value()
    # print final_params
    print type(final_params)
    print final_params.shape
    # print 'final b_o values'
    # print b_o.get_value()
    # fin_cost_test = fcost(predict(X_test), floatX(y_test).reshape(len(y_test), 1))
    # fin_cost_train = fcost(predict(X_train), floatX(y_train).reshape(len(y_train), 1))
    fin_cost_test = MSE(predict(X_test), y_test)
    fin_cost_train = MSE(predict(X_train), y_train)
    print 'vy: {}, prec: {}, Train: {}, Test: {}'.format(
        vy, precision, fin_cost_train, fin_cost_test)
    # Baseline: error obtained by always predicting the mean of the targets
    # (each set uses its own mean).
    test_mean = np.mean(y_test)
    train_mean = np.mean(y_train)
    mean_p_test = np.ones(y_test.size) * test_mean
    mean_p_train = np.ones(y_train.size) * train_mean
    # test_cost=fcost(floatX(mean_p_test).reshape(len(y_test), 1), floatX(y_test).reshape(len(y_test), 1))
    # train_cost=fcost(floatX(mean_p_train).reshape(len(y_train), 1), floatX(y_train).reshape(len(y_train), 1))
    mean_pred_test_cost = MSE(mean_p_test, y_test)
    mean_pred_train_cost = MSE(mean_p_train, y_train)
    # Constant reference curve; only used by the commented-out plot call.
    tArray = np.ones(epochs) * mean_pred_test_cost
    if (display):
        print 'MSE for mean prediction, Train:{} ,Test:{}'.format(
            mean_pred_train_cost, mean_pred_test_cost)
        plt.plot(range(epochs), test_costs, label='Test')
        plt.plot(range(epochs), train_costs, label='Train')
        # plt.plot(range(epochs), tArray, label='Reference',color='black',linewidth=1.6)
        plt.xlabel('Epochs')
        plt.ylabel('Error')
        plt.legend()
        # plt.title('TrainCost:{}, TestCost: {}, Ref: {}'.format(fin_cost_train, fin_cost_test, mean_pred_test_cost))
    return fin_cost_train, fin_cost_test, final_params
def log_likelihood_samplesImean_sigma(self, samples, mean, sigma):
    '''Per-row log-likelihood of `samples` (rows) under an independent
    Gaussian with the given `mean` and standard deviation `sigma`; the
    per-dimension terms are summed over axis 1.'''
    n_dims = T.cast(samples.shape[1], floatX)
    standardized = (samples - mean) / sigma
    # Squared standardized residual plus the log-variance term 2*log(sigma).
    per_dim = T.sqr(standardized) + 2 * T.log(sigma)
    return -log2pi * n_dims / 2 - T.sum(per_dim, axis=1) / 2
def log_likelihood_samples(self, samples):
    '''Return, as a vector with one entry per row of `samples`, the
    log-likelihood of each row under the zero mean, unit covariance
    Gaussian.'''
    n_dims = T.cast(samples.shape[1], floatX)
    squared_norms = T.sum(T.sqr(samples), axis=1)
    return -log2pi * n_dims / 2 - squared_norms / 2
def __init__(self, config_path, dnn):
    """
    Load a model from its config file, restore its weights and compile
    the Theano helper functions.

    `config_path` is loaded as a Python module exposing ``cfg`` and
    ``get_model``; `dnn` is forwarded to ``get_model``.  The weights file
    name is `config_path` with its last three characters replaced by
    ``.npz``.
    """
    config_module = imp.load_source('config', config_path)
    self.cfg = config_module.cfg
    self.weights_fname = str(config_path)[:-3] + '.npz'
    self.model = config_module.get_model(dnn=dnn)
    net = self.model
    get_params = lasagne.layers.get_all_params

    # Load weights: every trainable parameter of the output and
    # discriminator branches, plus any parameter whose name ends in
    # 'mean' or 'inv_std'; duplicates are removed through a set.
    print('(inside init of IAN)')
    print('Loading weights')
    trainable = (get_params(net['l_out'], trainable=True) +
                 get_params(net['l_discrim'], trainable=True))
    named_stats = [
        p for p in get_params(net['l_out']) + get_params(net['l_discrim'])
        if p.name[-4:] == 'mean' or p.name[-7:] == 'inv_std'
    ]
    params = list(set(trainable + named_stats))
    print('params = {}'.format(params))
    GANcheckpoints.load_weights(self.weights_fname, params)

    # Shuffle weights if using IAF with MADE
    if 'l_IAF_mu' in net:
        print('Shuffling MADE masks')
        for layer_key in ('l_IAF_mu', 'l_IAF_ls'):
            net[layer_key].reset("Once")

    print('Compiling Theano Functions')

    # Symbolic inputs: 4-D image tensor and 2-D latent matrix.
    self.X = T.TensorType('float32', [False] * 4)('X')
    self.Z = T.TensorType('float32', [False] * 2)('Z')

    # X_hat(Z): decode latents into images.
    self.X_hat = lasagne.layers.get_output(net['l_out'],
                                           {net['l_Z']: self.Z},
                                           deterministic=True)
    print('self.X_hat = {}'.format(self.X_hat))
    self.X_hat_fn = theano.function([self.Z], self.X_hat)

    # Z_hat(X): encode images into latents.
    self.Z_hat = lasagne.layers.get_output(net['l_Z'],
                                           {net['l_in']: self.X},
                                           deterministic=True)
    print('self.Z_hat = {}'.format(self.Z_hat))
    self.Z_hat_fn = theano.function([self.X], self.Z_hat)

    # Imgrad functions: integer bounds of the edited window and the
    # desired colour image.
    r1 = T.scalar('r1', dtype='int32')
    r2 = T.scalar('r2', dtype='int32')
    c1 = T.scalar('c', dtype='int32')
    c2 = T.scalar('c2', dtype='int32')
    RGB = T.tensor4('RGB', dtype='float32')
    window = self.X_hat[0, :, r1:r2, c1:c2]

    # Image Gradient Function, evaluates the change in latents which would
    # lighten the image in the local area.
    lighten_objective = T.mean(window)
    self.calculate_lighten_gradient = theano.function(
        [c1, r1, c2, r2, self.Z], T.grad(lighten_objective, self.Z))

    # Image Color Gradient Function, evaluates the change in latents which
    # would push the image towards the local desired RGB value.
    # Consider taking a smaller RGB array rather than a full-sized, indexed
    # one, and consider using the L1 loss instead of L2.
    colour_objective = T.mean((T.sqr(-window + RGB[0, :, r1:r2, c1:c2])))
    self.calculate_RGB_gradient = theano.function(
        [c1, r1, c2, r2, RGB, self.Z],
        T.grad(colour_objective, self.Z))  # may need a T.mean
def normal_lcdf(mu, sigma, x):
    """Log of the normal cumulative distribution function at `x`.

    Two algebraically equivalent forms are selected element-wise: the
    scaled complementary error function for standardized values below -1,
    and ``log1p`` of the complementary error function otherwise.
    """
    z = (x - mu) / sigma
    root_two = tt.sqrt(2.)
    # erfc(t) = erfcx(t) * exp(-t**2) with t = -z / sqrt(2), hence the
    # extra -z**2 / 2 term outside the log.
    lower_tail = tt.log(tt.erfcx(-z / root_two) / 2.) - tt.sqr(z) / 2.
    elsewhere = tt.log1p(-tt.erfc(z / root_two) / 2.)
    return tt.switch(tt.lt(z, -1.0), lower_tail, elsewhere)
def main():
    """Train an evolutionary GAN on CIFAR-10.

    Builds and compiles the discriminator, then alternates between
    (a) producing generator offspring with the different losses in
    `loss_type` and keeping the fittest `ncandi` of them and (b) updating
    the discriminator on real minibatches and images from the survivors.
    Writes a log file under ``logs/``, sample grids under ``samples/`` and
    weight snapshots under ``models/``.  Returns early (after printing a
    message) when ``./cifar-10-batches-py`` does not exist.
    """
    # Parameters
    task = 'cifar10'
    name = '0'
    begin_save = 0
    input_nc = 3
    loss_type = ['trickLogD', 'minimax', 'ls']
    nloss = 3
    shuffle_ = True
    batchSize = 32
    fineSize = 32
    flip = True

    ncandi = 1  # # of surviving children
    kD = 3  # # of discrim updates for each gen update
    kG = 1  # # of gen updates applied to each offspring
    ntf = 256  # # of images generated per candidate for fitness evaluation
    b1 = 0.5  # momentum term of adam
    nz = 100  # # of dim for Z
    ngf = 128  # # of gen filters in first conv layer
    ndf = 128  # # of discrim filters in first conv layer
    niter = 100  # # of iter at starting learning rate
    lr = 0.0002  # initial learning rate for adam G
    lrd = 0.0002  # initial learning rate for adam D
    beta = 0.002  # hyperparameter of fitness function
    GP_norm = False  # whether to apply a gradient penalty on the discriminator
    LAMBDA = 2.  # hyperparameter of GP term

    save_freq = 1000
    show_freq = 1000

    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']

    ################## MODEL D #######################
    print("Building model and compiling functions...")
    # Prepare Theano variables for inputs and targets
    real_imgs = T.tensor4('real_imgs')
    fake_imgs = T.tensor4('fake_imgs')
    # Create neural network model
    discriminator = models_uncond.build_discriminator_32(ndf=ndf)
    # Create expression for passing real data through the discriminator
    real_out = lasagne.layers.get_output(discriminator, real_imgs)
    # Create expression for passing fake data through the discriminator
    fake_out = lasagne.layers.get_output(discriminator, fake_imgs)
    # Create loss expressions
    discriminator_loss = (
        lasagne.objectives.binary_crossentropy(real_out, 1) +
        lasagne.objectives.binary_crossentropy(fake_out, 0)).mean()

    # Gradients penalty norm: penalise the squared deviation from 1 of the
    # discriminator's gradient norm at random interpolates between real
    # and fake images.
    if GP_norm is True:
        alpha = t_rng.uniform((batchSize, 1, 1, 1), low=0., high=1.)
        differences = fake_imgs - real_imgs
        interpolates = real_imgs + (alpha * differences)
        gradients = theano.grad(lasagne.layers.get_output(
            discriminator, interpolates).sum(),
                                wrt=interpolates)
        slopes = T.sqrt(T.sum(T.sqr(gradients), axis=(1, 2, 3)))
        gradient_penalty = T.mean((slopes - 1.)**2)

        D_loss = discriminator_loss + LAMBDA * gradient_penalty
        b1_d = 0.
    else:
        D_loss = discriminator_loss
        b1_d = b1

    # Create update expressions for training
    discriminator_params = lasagne.layers.get_all_params(discriminator,
                                                         trainable=True)
    lrtd = theano.shared(lasagne.utils.floatX(lrd))
    updates_d = lasagne.updates.adam(D_loss,
                                     discriminator_params,
                                     learning_rate=lrtd,
                                     beta1=b1_d)
    lrt = theano.shared(lasagne.utils.floatX(lr))

    # Diversity fitness: beta * log of the squared norm of the gradient of
    # the discriminator loss with respect to the discriminator parameters.
    Fd = theano.gradient.grad(discriminator_loss, discriminator_params)
    Fd_score = beta * T.log(sum(T.sum(T.sqr(x)) for x in Fd))

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_d = theano.function([real_imgs, fake_imgs],
                              discriminator_loss,
                              updates=updates_d)

    # Compile another function returning the mean discriminator output on
    # the real batch, on the fake batch, and the diversity score.
    disft_fn = theano.function([real_imgs, fake_imgs],
                               [(real_out).mean(), (fake_out).mean(),
                                Fd_score])

    # Launch the training loop.
    print("Starting training...")
    desc = task + '_' + name
    print desc

    if not os.path.isdir('logs'):
        os.mkdir(os.path.join('logs'))
    # NOTE(review): f_log is flushed every iteration but never closed in
    # this function.
    f_log = open('logs/%s.ndjson' % desc, 'wb')
    if not os.path.isdir('samples'):
        os.mkdir(os.path.join('samples/'))
    if not os.path.isdir('samples/' + desc):
        os.mkdir(os.path.join('samples/', desc))
    if not os.path.isdir('models'):
        os.mkdir(os.path.join('models/'))
    if not os.path.isdir('models/' + desc):
        os.mkdir(os.path.join('models/', desc))

    gen_new_params = []
    n_updates = 0

    # We iterate over epochs:
    for epoch in range(niter):
        if shuffle_ is True:
            X_train = shuffle(X_train)
        # Each xmb holds the real images for kD discriminator updates.
        for xmb in iter_data(X_train, size=batchSize * kD):
            # For measure fitness score
            # NOTE(review): the upper bound 50000 is hard-coded --
            # presumably the number of training images; confirm.
            sample_xmb = floatX(X_train[np_rng.randint(0, 50000, ncandi *
                                                       ntf), :, :, :])

            # initial G cluster (very first iteration only)
            if epoch + n_updates == 0:
                for can_i in range(0, ncandi):
                    train_g_, gen_fn_, generator_ = create_G(
                        loss_type=loss_type[can_i % nloss],
                        discriminator=discriminator,
                        lr=lr,
                        b1=b1,
                        ngf=ngf)
                    for _ in range(0, kG):
                        zmb = floatX(
                            np_rng.uniform(-1., 1., size=(batchSize, nz)))
                        cost = train_g_(zmb)
                    sample_zmb = floatX(np_rng.uniform(-1., 1.,
                                                       size=(ntf, nz)))
                    gen_imgs = gen_fn_(sample_zmb)

                    gen_new_params.append(
                        lasagne.layers.get_all_param_values(generator_))

                    # g_imgs_old stacks ntf images per candidate; fmb holds
                    # the fake images fed to the discriminator.
                    # NOTE(review): the slice bounds use `/` and rely on
                    # Python 2 integer division.
                    if can_i == 0:
                        g_imgs_old = gen_imgs
                        fmb = gen_imgs[0:batchSize / ncandi * kD, :, :, :]
                    else:
                        g_imgs_old = np.append(g_imgs_old, gen_imgs, axis=0)
                        fmb = np.append(fmb,
                                        gen_imgs[0:batchSize / ncandi *
                                                 kD, :, :, :],
                                        axis=0)

                ######## MODEL G ########
                # A single shared generator with one compiled training
                # function per loss; candidates are later swapped in and
                # out of it through their parameter values.
                noise = T.matrix('noise')
                generator = models_uncond.build_generator_32(noise, ngf=ngf)
                Tgimgs = lasagne.layers.get_output(generator)
                Tfake_out = lasagne.layers.get_output(discriminator, Tgimgs)

                g_loss_logD = lasagne.objectives.binary_crossentropy(
                    Tfake_out, 1).mean()
                g_loss_minimax = -lasagne.objectives.binary_crossentropy(
                    Tfake_out, 0).mean()
                g_loss_ls = T.mean(T.sqr((Tfake_out - 1)))

                g_params = lasagne.layers.get_all_params(generator,
                                                         trainable=True)

                up_g_logD = lasagne.updates.adam(g_loss_logD,
                                                 g_params,
                                                 learning_rate=lrt,
                                                 beta1=b1)
                up_g_minimax = lasagne.updates.adam(g_loss_minimax,
                                                    g_params,
                                                    learning_rate=lrt,
                                                    beta1=b1)
                up_g_ls = lasagne.updates.adam(g_loss_ls,
                                               g_params,
                                               learning_rate=lrt,
                                               beta1=b1)

                train_g_logD = theano.function([noise],
                                               g_loss_logD,
                                               updates=up_g_logD)
                train_g_minimax = theano.function([noise],
                                                  g_loss_minimax,
                                                  updates=up_g_minimax)
                train_g_ls = theano.function([noise],
                                             g_loss_ls,
                                             updates=up_g_ls)

                gen_fn = theano.function([noise],
                                         lasagne.layers.get_output(
                                             generator, deterministic=True))
            else:
                # Variation + selection: every parent produces one child
                # per loss type; children are scored and the best ncandi
                # are kept.
                gen_old_params = gen_new_params
                for can_i in range(0, ncandi):
                    for type_i in range(0, nloss):
                        # Reset the shared generator to the parent weights.
                        lasagne.layers.set_all_param_values(
                            generator, gen_old_params[can_i])
                        if loss_type[type_i] == 'trickLogD':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_logD(zmb)
                        elif loss_type[type_i] == 'minimax':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_minimax(zmb)
                        elif loss_type[type_i] == 'ls':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_ls(zmb)

                        sample_zmb = floatX(
                            np_rng.uniform(-1., 1., size=(ntf, nz)))
                        gen_imgs = gen_fn(sample_zmb)

                        # Fitness = mean discriminator output on the
                        # child's images minus the diversity score.
                        _, fr_score, fd_score = disft_fn(
                            sample_xmb[0:ntf], gen_imgs)
                        fit = fr_score - fd_score

                        if can_i * nloss + type_i < ncandi:
                            # The first ncandi children fill the slots
                            # unconditionally.
                            idx = can_i * nloss + type_i
                            gen_new_params[
                                idx] = lasagne.layers.get_all_param_values(
                                    generator)
                            fitness[idx] = fit
                            fake_rate[idx] = fr_score
                            g_imgs_old[idx * ntf:(idx + 1) *
                                       ntf, :, :, :] = gen_imgs
                            fmb[idx*batchSize/ncandi*kD:(idx+1)*batchSize/ncandi*kD,:,:,:] = \
                                gen_imgs[0:batchSize/ncandi*kD,:,:,:]
                        else:
                            # Later children replace the currently worst
                            # survivor if they are fitter than it.
                            fit_com = fitness - fit
                            if min(fit_com) < 0:
                                ids_replace = np.where(fit_com == min(fit_com))
                                idr = ids_replace[0][0]
                                fitness[idr] = fit
                                fake_rate[idr] = fr_score

                                gen_new_params[
                                    idr] = lasagne.layers.get_all_param_values(
                                        generator)

                                g_imgs_old[idr * ntf:(idr + 1) *
                                           ntf, :, :, :] = gen_imgs
                                fmb[idr*batchSize/ncandi*kD:(idr+1)*batchSize/ncandi*kD,:,:,:] = \
                                    gen_imgs[0:batchSize/ncandi*kD,:,:,:]

                print fake_rate, fitness
                f_log.write(
                    str(fake_rate) + ' ' + str(fd_score) + ' ' +
                    str(fitness) + '\n')

            # train D
            for xreal, xfake in iter_data(xmb, shuffle(fmb), size=batchSize):
                cost = train_d(xreal, xfake)

            # Re-score every survivor against the updated discriminator and
            # reset the fitness array for the next selection round.
            for i in range(0, ncandi):
                xfake = g_imgs_old[i * ntf:(i + 1) * ntf, :, :, :]
                xreal = sample_xmb[i * ntf:(i + 1) * ntf, :, :, :]
                tr, fr, fd = disft_fn(xreal, xfake)
                if i == 0:
                    fake_rate = np.array([fr])
                    fitness = np.array([0.])
                    real_rate = np.array([tr])
                    FDL = np.array([fd])
                else:
                    fake_rate = np.append(fake_rate, fr)
                    fitness = np.append(fitness, [0.])
                    real_rate = np.append(real_rate, tr)
                    FDL = np.append(FDL, fd)
            print fake_rate, FDL
            print(n_updates, epoch, real_rate.mean())
            f_log.write(
                str(fake_rate) + ' ' + str(FDL) + '\n' + str(epoch) + ' ' +
                str(n_updates) + ' ' + str(real_rate.mean()) + '\n')
            f_log.flush()

            if n_updates % show_freq == 0:
                # Save an 8x8 grid of the first 64 stored images, separated
                # by one-pixel gaps.
                blank_image = Image.new("RGB",
                                        (fineSize * 8 + 9, fineSize * 8 + 9))
                for i in range(8):
                    for ii in range(8):
                        img = g_imgs_old[i * 8 + ii, :, :, :]
                        img = ImgRescale(img,
                                         center=True,
                                         scale=True,
                                         convert_back=True)
                        blank_image.paste(
                            Image.fromarray(img),
                            (ii * fineSize + ii + 1, i * fineSize + i + 1))
                blank_image.save('samples/%s/%s_%d.png' %
                                 (desc, desc, n_updates / save_freq))

            if n_updates % save_freq == 0 and n_updates > begin_save - 1:
                # Optionally, you could now dump the network weights to a file like this:
                np.savez(
                    'models/%s/gen_%d.npz' % (desc, n_updates / save_freq),
                    *lasagne.layers.get_all_param_values(generator))
                np.savez(
                    'models/%s/dis_%d.npz' % (desc, n_updates / save_freq),
                    *lasagne.layers.get_all_param_values(discriminator))
            n_updates += 1
def normal_lccdf(mu, sigma, x):
    """Log of the normal complementary CDF (upper-tail probability) at `x`.

    Two algebraically equivalent forms are selected element-wise: the
    scaled complementary error function for standardized values above 1,
    and ``log1p`` of the complementary error function otherwise.
    """
    z = (x - mu) / sigma
    root_two = tt.sqrt(2.)
    # erfc(t) = erfcx(t) * exp(-t**2) with t = z / sqrt(2), hence the
    # extra -z**2 / 2 term outside the log.
    upper_tail = tt.log(tt.erfcx(z / root_two) / 2.) - tt.sqr(z) / 2.
    elsewhere = tt.log1p(-tt.erfc(-z / root_two) / 2.)
    return tt.switch(tt.gt(z, 1.0), upper_tail, elsewhere)
def fn(images):
    """Sum of squares of the 2x2 'valid'-mode neighbourhoods of `images`,
    reduced over axes 0 and 1 of the neighbourhood matrix."""
    patches = images2neibs(images, (2, 2), mode='valid')
    squared = T.sqr(patches)
    return T.sum(squared, axis=[0, 1])
def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    """Rescale a tensor so that none of its norms exceeds `max_norm`.

    Usable both as a max-norm weight constraint and for gradient clipping:
    vectors whose norm is larger than `max_norm` are scaled down onto the
    limit, the others are left (almost) untouched.

    Parameters
    ----------
    tensor_var : TensorVariable
        Theano expression for an update, a gradient, or another quantity.
    max_norm : scalar
        Largest norm allowed for any vector in `tensor_var`.
    norm_axes : sequence (list or tuple), optional
        Axes the norm is computed over.  When omitted, the axes follow the
        dimensionality of `tensor_var`: ``(0,)`` for a 2D matrix (dense
        layers), and every axis except axis 0 for 3D, 4D and 5D tensors
        (1D, 2D and 3D convolutional layers).
    epsilon : scalar, optional
        Added to the norms in the denominator to avoid dividing by very
        small or zero norms.

    Returns
    -------
    TensorVariable
        `tensor_var` with the rescaling applied to the vectors that
        violate the constraint.

    Raises
    ------
    ValueError
        If `norm_axes` is not given and `tensor_var` is not 2D to 5D.

    Examples
    --------
    >>> param = theano.shared(
    ...     np.random.randn(100, 200).astype(theano.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = theano.function([], [], updates=[(param, update)])
    >>> # Apply constrained update
    >>> _ = func()
    >>> from lasagne.utils import compute_norms
    >>> norms = compute_norms(param.get_value())
    >>> np.isclose(np.max(norms), 10)
    True

    Notes
    -----
    The default axes assume a dense-layer weight matrix for 2D input and a
    convolutional filter bank for 3D, 4D or 5D input.  For any other use,
    pass `norm_axes` explicitly.
    """
    ndim = tensor_var.ndim

    if norm_axes is None:
        if ndim == 2:  # DenseLayer
            sum_over = (0, )
        elif ndim in (3, 4, 5):  # Conv{1,2,3}DLayer
            sum_over = tuple(range(1, ndim))
        else:
            raise ValueError("Unsupported tensor dimensionality {}."
                             "Must specify `norm_axes`".format(ndim))
    else:
        sum_over = tuple(norm_axes)

    # Cast the Python scalars to floatX so they do not upcast the result.
    as_floatx = np.dtype(theano.config.floatX).type
    norms = T.sqrt(T.sum(T.sqr(tensor_var), axis=sum_over, keepdims=True))
    target_norms = T.clip(norms, 0, as_floatx(max_norm))
    scale = target_norms / (as_floatx(epsilon) + norms)
    return tensor_var * scale
def log_likelihood(tgt, mu, ls):
    """Gaussian log-likelihood of `tgt` with mean `mu` and log standard
    deviation `ls`, summed over every element."""
    half_log_two_pi = np.float32(0.5 * np.log(2 * np.pi))
    # exp(2 * ls) is the variance when ls is the log standard deviation.
    scaled_sq_err = 0.5 * T.sqr(tgt - mu) / T.exp(2 * ls)
    return T.sum(-(half_log_two_pi + ls) - scaled_sq_err)
def __init__(self, data, U, img_h=160, img_w=300, hidden_size=100,
             batch_size=50, lr=0.001, lr_decay=0.95, sqr_norm_lim=9,
             fine_tune_W=True, fine_tune_M=False, optimizer='adam',
             filter_sizes=[3, 4, 5], num_filters=100, conv_attn=False,
             encoder='rnn', elemwise_sum=True, corr_penalty=0.0,
             xcov_penalty=0.0, n_recurrent_layers=1,
             is_bidirectional=False):
    """Build the symbolic context/response scoring model.

    The constructor stores the hyper-parameters, declares the symbolic
    inputs, builds the encoder layers selected by `encoder` (substring
    match on 'cnn', 'rnn' and 'lstm'), encodes context and response,
    scores the pair with a bilinear form through `self.M`, defines
    `self.probas`, `self.pred`, `self.errors` and `self.cost`, and finally
    calls `self.update_params()`.

    Parameters
    ----------
    data : stored unchanged on `self.data`.
    U : initial value of the shared `embeddings` matrix.
    img_h, img_w : second and third dimension of the input layer shape;
        `img_w` is also the length of `self.zero_vec`.
        NOTE(review): presumably max sequence length and embedding size.
    hidden_size : number of units of the recurrent and dense layers.
    batch_size : fixed batch size baked into layer shapes and shared data.
    lr, lr_decay, sqr_norm_lim, fine_tune_W, fine_tune_M, optimizer :
        stored on `self`; not otherwise used in this method.
    filter_sizes, num_filters : heights and count of the conv filters.
    conv_attn : if true, a conv/pool/dense stack is applied on top of the
        masked recurrent outputs.
    encoder : string searched for 'cnn', 'rnn' and 'lstm'.
    elemwise_sum : combine CNN and recurrent encodings by addition
        (True) or by concatenation (False).
    corr_penalty, xcov_penalty : weights of the optional correlation and
        cross-covariance penalty terms (only used when both a CNN and a
        recurrent encoder are selected).
    n_recurrent_layers : number of stacked recurrent layers.
    is_bidirectional : build forward and backward recurrent stacks and
        concatenate them.
    """
    self.data = data
    self.img_h = img_h
    self.batch_size = batch_size
    self.fine_tune_W = fine_tune_W
    self.fine_tune_M = fine_tune_M
    self.lr = lr
    self.lr_decay = lr_decay
    self.optimizer = optimizer
    self.sqr_norm_lim = sqr_norm_lim
    self.conv_attn = conv_attn

    # Symbolic inputs. `index` is not referenced again in this method.
    index = T.iscalar()
    c = T.imatrix('c')
    r = T.imatrix('r')
    y = T.ivector('y')
    c_mask = T.fmatrix('c_mask')
    r_mask = T.fmatrix('r_mask')
    c_seqlen = T.ivector('c_seqlen')
    r_seqlen = T.ivector('r_seqlen')
    embeddings = theano.shared(U, name='embeddings', borrow=True)
    zero_vec_tensor = T.fvector()
    self.zero_vec = np.zeros(img_w, dtype=theano.config.floatX)
    # Compiled helper that overwrites row 0 of the embedding matrix with
    # the vector passed in.
    self.set_zero = theano.function([zero_vec_tensor], updates=[(embeddings, T.set_subtensor(
        embeddings[0, :], zero_vec_tensor))])

    # M is the bilinear scoring matrix, initialised to the identity. It is
    # twice as large when CNN and recurrent encodings are concatenated.
    if encoder.find('cnn') > -1 and (
            encoder.find('rnn') > -1 or
            encoder.find('lstm') > -1) and not elemwise_sum:
        self.M = theano.shared(np.eye(2 * hidden_size).astype(
            theano.config.floatX), borrow=True)
    else:
        self.M = theano.shared(np.eye(hidden_size).astype(
            theano.config.floatX), borrow=True)

    # Embedding lookup: index matrices -> 3D tensors of embedding rows.
    c_input = embeddings[c.flatten()].reshape(
        (c.shape[0], c.shape[1], embeddings.shape[1]))
    r_input = embeddings[r.flatten()].reshape(
        (r.shape[0], r.shape[1], embeddings.shape[1]))

    l_in = lasagne.layers.InputLayer(shape=(batch_size, img_h, img_w))

    # Optional CNN encoder: one conv + max-pool branch per filter size,
    # concatenated and projected to `hidden_size` with a tanh dense layer.
    if encoder.find('cnn') > -1:
        l_conv_in = lasagne.layers.ReshapeLayer(l_in,
                                                shape=(batch_size, 1,
                                                       img_h, img_w))
        conv_layers = []
        for filter_size in filter_sizes:
            conv_layer = lasagne.layers.Conv2DLayer(
                l_conv_in,
                num_filters=num_filters,
                filter_size=(filter_size, img_w),
                stride=(1, 1),
                nonlinearity=lasagne.nonlinearities.rectify,
                border_mode='valid')
            # Pool over the whole remaining height of the feature map.
            pool_layer = lasagne.layers.MaxPool2DLayer(
                conv_layer, pool_size=(img_h - filter_size + 1, 1))
            conv_layers.append(pool_layer)

        l_conv = lasagne.layers.ConcatLayer(conv_layers)
        l_conv = lasagne.layers.DenseLayer(
            l_conv,
            num_units=hidden_size,
            nonlinearity=lasagne.nonlinearities.tanh)

    # Recurrent encoder. It is always built, whatever `encoder` contains,
    # because `l_recurrent` is used and stored further down.
    if is_bidirectional:
        if encoder.find('lstm') > -1:
            prev_fwd, prev_bck = l_in, l_in
            for _ in xrange(n_recurrent_layers):
                l_fwd = lasagne.layers.LSTMLayer(prev_fwd,
                                                 hidden_size,
                                                 backwards=False,
                                                 learn_init=True,
                                                 peepholes=True)

                l_bck = lasagne.layers.LSTMLayer(prev_bck,
                                                 hidden_size,
                                                 backwards=True,
                                                 learn_init=True,
                                                 peepholes=True)

                prev_fwd, prev_bck = l_fwd, l_bck
        else:
            prev_fwd, prev_bck = l_in, l_in
            for _ in xrange(n_recurrent_layers):
                l_fwd = lasagne.layers.RecurrentLayer(
                    prev_fwd,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=False,
                    learn_init=True)

                l_bck = lasagne.layers.RecurrentLayer(
                    prev_bck,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=True,
                    learn_init=True)

                prev_fwd, prev_bck = l_fwd, l_bck

        l_recurrent = lasagne.layers.ConcatLayer([l_fwd, l_bck])
    else:
        prev_fwd = l_in
        if encoder.find('lstm') > -1:
            for _ in xrange(n_recurrent_layers):
                l_recurrent = lasagne.layers.LSTMLayer(prev_fwd,
                                                       hidden_size,
                                                       backwards=False,
                                                       learn_init=True,
                                                       peepholes=True)
                prev_fwd = l_recurrent
        else:
            for _ in xrange(n_recurrent_layers):
                l_recurrent = lasagne.layers.RecurrentLayer(
                    prev_fwd,
                    hidden_size,
                    nonlinearity=lasagne.nonlinearities.tanh,
                    W_hid_to_hid=lasagne.init.Orthogonal(),
                    W_in_to_hid=lasagne.init.Orthogonal(),
                    backwards=False,
                    learn_init=True)
                prev_fwd = l_recurrent

    recurrent_size = hidden_size * 2 if is_bidirectional else hidden_size
    # With conv_attn the output network is a second conv/pool/dense stack
    # that takes the recurrent states as its input; otherwise the output
    # layer is the recurrent layer itself.
    if conv_attn:
        l_rconv_in = lasagne.layers.InputLayer(shape=(batch_size, img_h,
                                                      recurrent_size))
        l_rconv_in = lasagne.layers.ReshapeLayer(l_rconv_in,
                                                 shape=(batch_size, 1,
                                                        img_h,
                                                        recurrent_size))
        conv_layers = []
        for filter_size in filter_sizes:
            conv_layer = lasagne.layers.Conv2DLayer(
                l_rconv_in,
                num_filters=num_filters,
                filter_size=(filter_size, recurrent_size),
                stride=(1, 1),
                nonlinearity=lasagne.nonlinearities.rectify,
                border_mode='valid')
            pool_layer = lasagne.layers.MaxPool2DLayer(
                conv_layer, pool_size=(img_h - filter_size + 1, 1))
            conv_layers.append(pool_layer)

        l_hidden1 = lasagne.layers.ConcatLayer(conv_layers)
        l_hidden2 = lasagne.layers.DenseLayer(
            l_hidden1,
            num_units=hidden_size,
            nonlinearity=lasagne.nonlinearities.tanh)
        l_out = l_hidden2
    else:
        l_out = l_recurrent

    if conv_attn:
        e_context = l_recurrent.get_output(c_input,
                                           mask=c_mask,
                                           deterministic=False)
        e_response = l_recurrent.get_output(r_input,
                                            mask=r_mask,
                                            deterministic=False)

        # Multiplies each slice handed over by scan with its mask slice.
        def step_fn(row_t, mask_t):
            return row_t * mask_t.reshape((-1, 1))

        if is_bidirectional:
            # The mask is duplicated along axis 1 before being scanned
            # together with the concatenated forward/backward states.
            e_context, _ = theano.scan(step_fn,
                                       outputs_info=None,
                                       sequences=[
                                           e_context, T.concatenate(
                                               [c_mask, c_mask], axis=1)
                                       ])
            e_response, _ = theano.scan(step_fn,
                                        outputs_info=None,
                                        sequences=[
                                            e_response, T.concatenate(
                                                [r_mask, r_mask], axis=1)
                                        ])
        else:
            e_context, _ = theano.scan(step_fn,
                                       outputs_info=None,
                                       sequences=[e_context, c_mask])
            e_response, _ = theano.scan(step_fn,
                                        outputs_info=None,
                                        sequences=[e_response, r_mask])

        e_context = l_out.get_output(e_context,
                                     mask=c_mask,
                                     deterministic=False)
        e_response = l_out.get_output(e_response,
                                      mask=r_mask,
                                      deterministic=False)
    else:
        # Pick, for every example, the recurrent output at the index given
        # by the corresponding *_seqlen entry.
        e_context = l_out.get_output(
            c_input, mask=c_mask,
            deterministic=False)[T.arange(batch_size), c_seqlen].reshape(
                (c.shape[0], hidden_size))
        e_response = l_out.get_output(
            r_input, mask=r_mask,
            deterministic=False)[T.arange(batch_size), r_seqlen].reshape(
                (r.shape[0], hidden_size))

    if encoder.find('cnn') > -1:
        e_conv_context = l_conv.get_output(c_input, deterministic=False)
        e_conv_response = l_conv.get_output(r_input, deterministic=False)
        if encoder.find('rnn') > -1 or encoder.find('lstm') > -1:
            # Hybrid encoder: merge CNN and recurrent encodings.
            if elemwise_sum:
                e_context = e_context + e_conv_context
                e_response = e_response + e_conv_response
            else:
                e_context = T.concatenate([e_context, e_conv_context],
                                          axis=1)
                e_response = T.concatenate([e_response, e_conv_response],
                                           axis=1)

            # penalize correlation
            # One negated correlation coefficient per encoding dimension,
            # computed across the batch between context and response.
            if abs(corr_penalty) > 0:
                cor = []
                for i in range(hidden_size if elemwise_sum else 2 *
                               hidden_size):
                    y1, y2 = e_context, e_response
                    x1 = y1[:, i] - (np.ones(batch_size) *
                                     (T.sum(y1[:, i]) / batch_size))
                    x2 = y2[:, i] - (np.ones(batch_size) *
                                     (T.sum(y2[:, i]) / batch_size))
                    nr = T.sum(x1 * x2) / (T.sqrt(T.sum(x1 * x1)) *
                                           T.sqrt(T.sum(x2 * x2)))
                    cor.append(-nr)
            # Sum of squared entries of the batch-mean outer product of the
            # centred context and response encodings.
            if abs(xcov_penalty) > 0:
                e_context_mean = T.mean(e_context, axis=0, keepdims=True)
                e_response_mean = T.mean(e_response, axis=0, keepdims=True)
                e_context_centered = e_context - e_context_mean  # (n, i)
                e_response_centered = e_response - e_response_mean  # (n, j)

                outer_prod = (e_context_centered.dimshuffle(0, 1, 'x') *
                              e_response_centered.dimshuffle(0, 'x', 1)
                              )  # (n, i, j)
                xcov = T.sum(T.sqr(T.mean(outer_prod, axis=0)))
        else:
            # CNN-only encoder: the recurrent encodings are discarded.
            e_context = e_conv_context
            e_response = e_conv_response

    # Bilinear score per example, squashed to a probability and clipped
    # away from 0 and 1 before it goes into the cross-entropy.
    dp = T.batched_dot(e_context, T.dot(e_response, self.M.T))
    #dp = pp('dp')(dp)
    o = T.nnet.sigmoid(dp)
    o = T.clip(o, 1e-7, 1.0 - 1e-7)

    # Zero-initialised shared buffers, one per symbolic input.
    self.shared_data = {}
    for key in ['c', 'r']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, img_h), dtype=np.int32))
    for key in ['c_mask', 'r_mask']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, img_h), dtype=theano.config.floatX))
    for key in ['y', 'c_seqlen', 'r_seqlen']:
        self.shared_data[key] = theano.shared(
            np.zeros((batch_size, ), dtype=np.int32))

    # Two-column probabilities: column 0 is 1 - o, column 1 is o.
    self.probas = T.concatenate([(1 - o).reshape(
        (-1, 1)), o.reshape((-1, 1))], axis=1)
    self.pred = T.argmax(self.probas, axis=1)
    self.errors = T.sum(T.neq(self.pred, y))
    self.cost = T.nnet.binary_crossentropy(o, y).mean()
    # The penalties only exist for the hybrid encoder (see above), so the
    # same condition guards their use here.
    if encoder.find('cnn') > -1 and (encoder.find('rnn') > -1 or
                                     encoder.find('lstm') > -1):
        if abs(corr_penalty) > 0:
            self.cost += corr_penalty * T.sum(cor)
        if abs(xcov_penalty) > 0:
            self.cost += xcov_penalty * xcov
    self.l_out = l_out
    self.l_recurrent = l_recurrent
    self.embeddings = embeddings
    self.c = c
    self.r = r
    self.y = y
    self.c_seqlen = c_seqlen
    self.r_seqlen = r_seqlen
    self.c_mask = c_mask
    self.r_mask = r_mask

    self.update_params()
def diag_normal_nll(z, z_mu, z_log_sigma):
    """Per-row negative log-likelihood term of a diagonal Gaussian.

    Sums over axis 1 and returns one value per row. The constant
    ``log(2*pi)`` term is not included.

    NOTE(review): the log term is weighted by 0.5 while the residual is
    divided by ``exp(z_log_sigma)``; confirm whether `z_log_sigma` is meant
    to be a log standard deviation or a log variance.
    """
    # 1e-6 keeps the division finite when exp(z_log_sigma) underflows.
    scale = 1e-6 + T.exp(z_log_sigma)
    standardized = (z - z_mu) / scale
    log_term = 0.5 * T.sum(z_log_sigma, axis=1)
    quadratic_term = T.sum(T.sqr(standardized), axis=1) / 2.
    return log_term + quadratic_term
def __init__(self, numpy_rng, theano_rng=None, cfg = None, testing = False, input = None):
    """Build a conv -> LSTM -> fully-connected -> output network.

    Layers are created in that order and appended to `self.layers`; their
    parameters are collected in `self.params` / `self.delta_params`.
    `self.finetune_cost` and `self.errors` are taken from the output layer.

    Parameters
    ----------
    numpy_rng : random generator handed to the layers.
    theano_rng : optional; when falsy a `RandomStreams` is seeded from
        `numpy_rng` (this consumes one draw from `numpy_rng`).
    cfg : network configuration object. Its `n_ins` attribute is
        overwritten with the flattened size of the last conv output.
    testing : forwarded to every `ConvLayer`.
    input : optional symbolic input; a fresh matrix 'x' is used if None.
    """
    self.cfg = cfg
    self.params = []
    self.delta_params = []
    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg
    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size
    self.max_col_norm = cfg.max_col_norm

    self.layers = []
    self.conv_layers = []
    self.lstm_layers = []
    self.fc_layers = []

    # 1. conv
    self.conv_layer_configs = cfg.conv_layer_configs
    self.conv_activation = cfg.conv_activation
    self.conv_layers_number = len(self.conv_layer_configs)
    self.use_fast = cfg.use_fast
    # 2. lstm
    self.lstm_layers_sizes = cfg.lstm_layers_sizes
    self.lstm_layers_number = len(self.lstm_layers_sizes)
    # 3. dnn
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    # `is None` instead of `== None`: identity is what is meant, and `==`
    # would be evaluated element-wise if an array-like were passed.
    if input is None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.matrix('y')

    #######################
    # build conv layers   #
    #######################
    print('1. start to build conv layer: ' + str(self.conv_layers_number))
    for i in xrange(self.conv_layers_number):
        if i == 0:
            input = self.x
        else:
            input = self.conv_layers[-1].output
        config = self.conv_layer_configs[i]
        conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                               input_shape = config['input_shape'],
                               filter_shape = config['filter_shape'],
                               poolsize = config['poolsize'],
                               activation = self.conv_activation,
                               flatten = config['flatten'],
                               use_fast = self.use_fast, testing = testing)
        print('\tbuild conv layer: ' + str(config['input_shape']))
        self.layers.append(conv_layer)
        self.conv_layers.append(conv_layer)
        self.params.extend(conv_layer.params)
        self.delta_params.extend(conv_layer.delta_params)

    # `config` is the configuration of the last conv layer at this point.
    self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
    print('\t cnn out: ' + str(self.conv_output_dim))
    cfg.n_ins = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
    print('1. finish conv layer: ' + str(self.layers[-1].n_out))

    #######################
    # build lstm layers   #
    #######################
    print('2. start to build lstm layer: ' + str(self.lstm_layers_number))
    for i in xrange(self.lstm_layers_number):
        if i == 0:
            input_size = self.conv_output_dim
        else:
            input_size = self.lstm_layers_sizes[i - 1]
        # Every LSTM layer is fed by whatever layer was added last.
        input = self.layers[-1].output
        print('build lstm layer: ' + str(input_size))
        lstm_layer = LSTMLayer(rng=numpy_rng, input=input, n_in=input_size,
                               n_out=self.lstm_layers_sizes[i])
        print('\tbuild lstm layer: ' + str(input_size) + ' x ' + str(lstm_layer.n_out))
        self.layers.append(lstm_layer)
        self.lstm_layers.append(lstm_layer)
        self.params.extend(lstm_layer.params)
        self.delta_params.extend(lstm_layer.delta_params)
    print('2. finish lstm layer: ' + str(self.layers[-1].n_out))

    #######################
    # build dnnv layers   #
    #######################
    print('3. start to build dnnv layer: ' + str(self.hidden_layers_number))
    for i in xrange(self.hidden_layers_number):
        if i == 0:
            input_size = self.layers[-1].n_out
        else:
            input_size = self.hidden_layers_sizes[i - 1]
        input = self.layers[-1].output
        fc_layer = HiddenLayer(rng=numpy_rng, input=input, n_in=input_size,
                               n_out=self.hidden_layers_sizes[i])
        print('\tbuild dnnv layer: ' + str(input_size) + ' x ' + str(fc_layer.n_out))
        self.layers.append(fc_layer)
        self.fc_layers.append(fc_layer)
        self.params.extend(fc_layer.params)
        self.delta_params.extend(fc_layer.delta_params)
    print('3. finish dnnv layer: ' + str(self.layers[-1].n_out))

    #######################
    # build log layers    #
    #######################
    print('4. start to build log layer: 1')
    input_size = self.layers[-1].n_out
    input = self.layers[-1].output
    logLayer = OutputLayer(input=input, n_in=input_size, n_out=self.n_outs)
    # Report the output layer's own width. The previous code printed
    # `fc_layer.n_out`, i.e. the last hidden layer's width, and raised a
    # NameError when no hidden layer had been configured.
    print('\tbuild final layer: ' + str(input_size) + ' x ' + str(logLayer.n_out))
    self.layers.append(logLayer)
    self.params.extend(logLayer.params)
    self.delta_params.extend(logLayer.delta_params)
    print('4. finish log layer: ' + str(self.layers[-1].n_out))
    print('Total layers: ' + str(len(self.layers)))

    sys.stdout.flush()

    self.finetune_cost = self.layers[-1].l2(self.y)
    self.errors = self.layers[-1].errors(self.y)

    if self.l2_reg is not None:
        # NOTE(review): this indexes `self.layers` from 0, i.e. it starts
        # at the conv layers, while the loop bound is the number of
        # fully-connected layers. Confirm whether `self.fc_layers` was
        # intended; behaviour is left unchanged here.
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
def get_local_cost(self):
    """Return the local cost: reconstruction error + 0.1 * sparsity + top-down term.

    The reconstruction error is the summed squared difference between
    `self.S` and ``dot(self.X, self.W)``; the sparsity term is a smoothed
    absolute value of `self.X`, summed over all entries.
    """
    reconstruction = T.dot(self.X, self.W)
    squared_residual = T.sqr(self.S - reconstruction).sum()
    # sqrt(x^2 + 1e-6) is a differentiable stand-in for |x|.
    smooth_abs_code = T.sqrt(T.sqr(self.X) + 1e-6).sum()
    top_down_term = self.get_top_down_flow()
    return squared_residual + .1 * smooth_abs_code + top_down_term
def train_batch(self, dataset, batch_size):
    """
    Run one training step on a random contiguous slice of the dataset.

    A window of `batch_size` consecutive rows is drawn from the design
    matrix, a code `gamma` is optimised for every row with
    `self.optimize_gamma`, `cg.linear_cg` is run on the objective with
    respect to `self.W`, and the summed objective over the batch is logged.

    Parameters
    ----------
    dataset : object providing `get_design_matrix()`; the matrix must have
        `self.nvis` columns.
    batch_size : number of consecutive rows to train on.

    Returns
    -------
    bool
        Always True.
    """
    # TODO-- this results in compilation happening every time learn is
    # called should cache the compilation results, including those
    # inside cg
    design = dataset.get_design_matrix()
    num_rows = design.shape[0]
    assert design.shape[1] == self.nvis

    gamma = N.zeros((batch_size, self.nhid))

    # Symbolic objective for a single example / code pair.
    cur_gamma = T.vector(name='cur_gamma')
    cur_v = T.vector(name='cur_v')

    reconstruction = T.dot(cur_gamma, self.W)
    reconstruction.name = 'recons'

    residual = cur_v - reconstruction
    residual.name = 'recons_diffs'

    residual_sq = T.sqr(residual)
    residual_sq.name = 'recons_diff'

    reconstruction_error = T.sum(residual_sq)
    reconstruction_error.name = 'recons_error'

    # Squared distance of every row of W to the current example.
    atom_distances = T.sum(T.sqr(self.W - cur_v), axis=1)
    atom_distances.name = 'dict_dists'

    gamma_magnitude = abs(cur_gamma)
    gamma_magnitude.name = 'abs_gamma'

    locality = T.dot(gamma_magnitude, atom_distances)
    locality.name = 'weighted_dists'

    penalty = self.coeff * locality
    penalty.name = 'penalty'

    # prevent directions of absolute flatness in the hessian
    tiny_regulariser = 1e-10 * T.sum(atom_distances)
    tiny_regulariser.name = 'debug'

    J = reconstruction_error + penalty + tiny_regulariser
    J.name = 'J'

    objective_fn = function([cur_v, cur_gamma], J)

    first_row = self.rng.randint(num_rows - batch_size + 1)
    batch_X = design[first_row:first_row + batch_size, :]

    # TODO-- optimize gamma
    logger.info('optimizing gamma')
    for row in xrange(batch_size):
        gamma[row, :] = self.optimize_gamma(batch_X[row, :])

    gamma_abs = N.abs(gamma)
    logger.info('max min')
    logger.info(gamma_abs.min(axis=0).max())
    logger.info('min max')
    logger.info(gamma_abs.max(axis=0).max())

    # Optimize W
    logger.info('optimizing W')
    logger.warning("not tested since switching to Razvan's all-theano "
                   "implementation of linear cg")
    cg.linear_cg(J, [self.W], max_iters=3)

    # Total objective over the batch after the W update.
    err = 0.
    for row in xrange(batch_size):
        err += objective_fn(batch_X[row, :], gamma[row, :])
    assert not N.isnan(err)
    assert not N.isinf(err)
    logger.info('err: {0}'.format(err))
    return True
def get_layer_monitoring_channels(self, state_below=None,
                                  state=None, targets=None):
    """Return the monitoring channels for this layer.

    As written, the returned `OrderedDict` is always empty: the weight-norm
    channels are wrapped in a string literal and the activation statistics
    sit behind `if False:`. The weight matrix is still fetched and checked
    to be 2-dimensional, and the row/column norm expressions are still
    built even though they are not returned.

    Parameters
    ----------
    state_below, state, targets : unused while the statistics branch is
        disabled.

    Returns
    -------
    OrderedDict
        Mapping of channel name to symbolic expression (currently empty).
    """

    #sc = abs(self.Xout).sum()

    #Get last local_error get_local_error()
    #le = self.local_reconstruction_error

    # Unpacking with `W,` requires the transformer to expose exactly one
    # parameter.
    W, = self.transformer.get_params()

    assert W.ndim == 2

    sq_W = T.sqr(W)

    row_norms = T.sqrt(sq_W.sum(axis=1))
    col_norms = T.sqrt(sq_W.sum(axis=0))

    row_norms_min = row_norms.min()
    row_norms_min.__doc__ = ("The smallest norm of any row of the "
                             "weight matrix W. This is a measure of the "
                             "least influence any visible unit has.")

    # The string literal below is disabled code kept for reference; it is
    # evaluated as a no-op expression statement.
    '''
    rval = OrderedDict([('row_norms_min',  row_norms_min),
                        ('row_norms_mean', row_norms.mean()),
                        ('row_norms_max',  row_norms.max()),
                        ('col_norms_min',  col_norms.min()),
                        ('col_norms_mean', col_norms.mean()),
                        ('col_norms_max',  col_norms.max())])#,
                        #('sparse_code_l1_norm', sc.mean())])
    '''
    rval = OrderedDict()

    # Disabled on purpose; the original condition is kept in the comment.
    if False: #(state is not None) or (state_below is not None):
        if state is None:
            state = self.fprop(state_below)

        P = state
        #if self.pool_size == 1:
        vars_and_prefixes = [(P, '')]
        #else:
        #    vars_and_prefixes = [(P, 'p_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            # max_x.mean_u is "the mean over *u*nits of the max over
            # e*x*amples" The x and u are included in the name because
            # otherwise its hard to remember which axis is which when
            # reading the monitor I use inner.outer
            # rather than outer_of_inner or
            # something like that because I want mean_x.* to appear next to
            # each other in the alphabetical list, as these are commonly
            # plotted together
            for key, val in [('max_x.max_u', v_max.max()),
                             ('max_x.mean_u', v_max.mean()),
                             ('max_x.min_u', v_max.min()),
                             ('min_x.max_u', v_min.max()),
                             ('min_x.mean_u', v_min.mean()),
                             ('min_x.min_u', v_min.min()),
                             ('range_x.max_u', v_range.max()),
                             ('range_x.mean_u', v_range.mean()),
                             ('range_x.min_u', v_range.min()),
                             ('mean_x.max_u', v_mean.max()),
                             ('mean_x.mean_u', v_mean.mean()),
                             ('mean_x.min_u', v_mean.min())]:
                rval[prefix+key] = val

    return rval
def create_updates(self, params, verbose=1):
    """
    This basically creates all the updates and update functions which trainers can iterate
    upon.

    Reads ``self.gradients``, ``self.optimizer_type``, ``self.momentum_type``,
    ``self.learning_rate`` and ``self.momentum``; writes the resulting
    update dictionary to ``self.updates``. ``self.momentum_type`` may be
    rewritten (to ``'_adam'`` for Adam, or to ``'false'`` when the
    configured momentum is not recognized).

    Args:
        params: Supply learnable active parameters of a network. They are
            paired positionally with ``self.gradients``.
        verbose: Just as always

    Raises:
        ValueError: if ``self.optimizer_type`` is not one of ``'adagrad'``,
            ``'rmsprop'``, ``'sgd'`` or ``'adam'``.
    """
    # accumulate velocities for momentum
    if verbose >= 3:
        print("... creating internal parameters for all the optimizations")
    velocities = []
    for param in params:
        velocity = theano.shared(
            numpy.zeros(param.get_value(borrow=True).shape,
                        dtype=theano.config.floatX))
        velocities.append(velocity)

    # these are used for second order optimizers.
    accumulator_1 = []
    accumulator_2 = []
    for param in params:
        # Each accumulator gets its OWN zero array. With borrow=True a
        # shared variable may alias the array it is given, so handing the
        # same array to both would let the two accumulators share memory.
        value = param.get_value(borrow=True)
        eps_1 = numpy.zeros_like(value, dtype=theano.config.floatX)
        eps_2 = numpy.zeros_like(value, dtype=theano.config.floatX)
        accumulator_1.append(theano.shared(eps_1, borrow=True))
        accumulator_2.append(theano.shared(eps_2, borrow=True))

    # these are used for adam.
    timestep = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))
    delta_t = timestep + 1
    b1 = 0.9        # for ADAM
    b2 = 0.999      # for ADAM
    # Bias-correction factor of the Adam step size.
    a = T.sqrt(1 - b2**delta_t) / (1 - b1**delta_t)     # for ADAM

    # to avoid division by zero
    fudge_factor = 1e-7
    if verbose >= 3:
        print("... Building backprop network.")

    # This is copied straight from my old toolbox: Samosa. I hope this is working correctly.
    # There might be a better way to have written these... different methods for different
    # optimizers perhaps ?
    if verbose >= 3:
        print("... Applying " + self.optimizer_type)
        print("... Applying " + self.momentum_type)

    self.updates = OrderedDict()
    for velocity, gradient, acc_1, acc_2, param in zip(
            velocities, self.gradients, accumulator_1, accumulator_2, params):
        if self.optimizer_type == 'adagrad':
            # Adagrad implemented from paper:
            # John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods
            # for online learning and stochastic optimization. JMLR
            current_acc_1 = acc_1 + T.sqr(gradient)  # Accumulates Gradient
            self.updates[acc_1] = current_acc_1  # updates accumulation at timestamp

        elif self.optimizer_type == 'rmsprop':
            # Tieleman, T. and Hinton, G. (2012):
            # Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
            # Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
            rms_rho = 0.9
            current_acc_1 = rms_rho * acc_1 + (1 - rms_rho) * T.sqr(gradient)
            self.updates[acc_1] = current_acc_1

        elif self.optimizer_type == 'sgd':
            current_acc_1 = 1.

        elif self.optimizer_type == 'adam':
            # Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization."
            # arXiv preprint arXiv:1412.6980 (2014).
            if not self.momentum_type == '_adam':
                if verbose >= 3 and not self.momentum_type == 'false':
                    print("... ADAM doesn't need explicit momentum. Momentum is removed.")
                self.momentum_type = '_adam'

            current_acc_2 = b1 * acc_2 + (1 - b1) * gradient        # first moment
            current_acc_1 = b2 * acc_1 + (1 - b2) * T.sqr(gradient)  # second moment
            self.updates[acc_2] = current_acc_2
            self.updates[acc_1] = current_acc_1

        else:
            # Without this, the first use of `current_acc_1` below fails
            # with an unhelpful NameError.
            raise ValueError(
                "Unrecognized optimizer type: " + str(self.optimizer_type))

        if self.momentum_type == '_adam':
            # The velocity is ADDED to the parameter below, so the Adam step
            # must carry the minus sign and the learning rate, like every
            # other branch: theta <- theta - lr * a * m / (sqrt(v) + eps).
            self.updates[velocity] = -self.learning_rate * a * current_acc_2 / (
                T.sqrt(current_acc_1) + fudge_factor)

        elif self.momentum_type == 'false':               # no momentum
            self.updates[velocity] = -(self.learning_rate / T.sqrt(
                current_acc_1 + fudge_factor)) * gradient

        elif self.momentum_type == 'polyak':       # if polyak momentum
            # Momentum implemented from paper:
            # Polyak, Boris Teodorovich. "Some methods of speeding up the convergence of
            # iteration methods." USSR Computational Mathematics and Mathematical
            # Physics 4.5 (1964): 1-17.
            # Adapted from Sutskever, Ilya, Hinton et al. "On the importance of
            # initialization and momentum in deep learning.", Proceedings of the 30th
            # international conference on machine learning (ICML-13). 2013.
            # equation (1) and equation (2)
            self.updates[velocity] = self.momentum * velocity - (1. - self.momentum) * \
                (self.learning_rate / T.sqrt(current_acc_1 + fudge_factor)) \
                * gradient

        elif self.momentum_type == 'nesterov':             # Nestrov accelerated gradient
            # Nesterov, Yurii. "A method of solving a convex programming problem with
            # convergence rate O (1/k2)." Soviet Mathematics Doklady. Vol. 27. No. 2. 1983.
            # Adapted from
            # https://blogs.princeton.edu/imabandit/2013/04/01/acceleratedgradientdescent/
            # Instead of using past params we use the current params as described in this
            # link https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617,
            self.updates[velocity] = self.momentum * velocity - (1. - self.momentum) * \
                (self.learning_rate / T.sqrt(current_acc_1 + fudge_factor)) \
                * gradient
            # Temporarily holds the extra look-ahead term; it is folded into
            # `stepped_param` and then overwritten further down.
            self.updates[param] = self.momentum * self.updates[velocity]

        else:
            if verbose >= 3:
                print("... Unrecognized mometum type, switching to no momentum.")
            self.momentum_type = 'false'
            self.updates[velocity] = -(self.learning_rate / T.sqrt(
                current_acc_1 + fudge_factor)) * gradient

        stepped_param = param + self.updates[velocity]
        if self.momentum_type == 'nesterov':
            stepped_param = stepped_param + self.updates[param]

        column_norm = True  # This I don't fully understand if
                            # its needed after BN is implemented.
        # This is been around since my first ever
        # implementation of samosa, and I haven't tested it out.
        if param.get_value(borrow=True).ndim == 2 and column_norm is True:
            # constrain the norms of the COLUMNs of the weight, according to
            # https://github.com/BVLC/caffe/issues/109
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(15))
            scale = desired_norms / (fudge_factor + col_norms)
            self.updates[param] = stepped_param * scale
        else:
            self.updates[param] = stepped_param

    if self.optimizer_type == 'adam':
        self.updates[timestep] = delta_t
def GaussianNLL(y, mu, sig):
    """Gaussian negative log-likelihood of `y`, summed over axis 0.

    `mu` is the mean and `sig` the standard deviation (it is squared in the
    quadratic term and enters the log term as ``2 * log(sig)``).
    """
    quadratic = T.sqr(y - mu) / sig**2
    log_variance = 2 * T.log(sig)
    log_two_pi = T.log(2 * numpy.pi)
    return 0.5 * T.sum(quadratic + log_variance + log_two_pi, axis=0)
def get_local_cost(self):
    """Return the local cost: reconstruction error + 0.1 * sparsity + top-down term.

    The reconstruction of `self.S` is produced by
    ``self.transformer.lmul(self.X)``; the sparsity term is a smoothed
    absolute value of `self.X`, summed over all entries.
    """
    reconstruction = self.transformer.lmul(self.X)
    squared_residual = T.sqr(self.S - reconstruction).sum()
    # sqrt(x^2 + 1e-6) is a differentiable stand-in for |x|.
    smooth_abs_code = T.sqrt(T.sqr(self.X) + 1e-6).sum()
    top_down_term = self.get_top_down_flow()
    return squared_residual + .1 * smooth_abs_code + top_down_term
# Tail of the network: nonlinearity, a 10-unit dense layer with identity
# nonlinearity, then batch normalisation.
# NOTE(review): `cnn`, `activation`, `binary`, `stochastic`, `H`,
# `W_LR_scale`, `epsilon`, `alpha`, `target` and `LR` are defined earlier,
# outside this fragment.
cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

cnn = binary_net.DenseLayer(cnn,
                            binary=binary,
                            stochastic=stochastic,
                            H=H,
                            W_LR_scale=W_LR_scale,
                            nonlinearity=lasagne.nonlinearities.identity,
                            num_units=10)

cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

# Training-time output (deterministic=False).
train_output = lasagne.layers.get_output(cnn, deterministic=False)

# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

if binary:
    # W updates
    # Parameters tagged binary=True get Adam updates from gradients
    # computed by binary_net, then pass through clipping_scaling.
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # other parameters updates
    # Trainable parameters that are not tagged binary.
    params = lasagne.layers.get_all_params(cnn,
                                           trainable=True,
                                           binary=False)
def inner_fn(t, stm1, postm1, vtm1):
    """One simulation step: act, move the environment, infer, score.

    All weight matrices, biases, sizes (`n_s`, `n_o`, `n_oh`, `n_oa`,
    `n_proc`), the `sig_min_*` floors and `theano_rng` are taken from the
    enclosing scope.

    Parameters
    ----------
    t : step counter; only used to switch the prior on state row 0 once
        ``t >= 20``.
    stm1 : hidden state of the previous step.
    postm1 : position of the previous step.
    vtm1 : velocity of the previous step.

    Returns
    -------
    tuple
        ``(st, post, vt, oat, ot, oht, FEt, KL_st, stmu, stsig, force,
        p_ot, p_oht, p_oat)``.

    NOTE(review): the function draws from `theano_rng` three times (action,
    position noise, state sample); keep that order if this is edited.
    """

    # Use hidden state to generate action state
    aht = T.dot(Wa_aht_st, T.reshape(stm1, (n_s, n_proc))) + ba_aht
    #aht2 = T.dot(Wa_aht2_aht, T.reshape(aht,(n_s,n_proc))) + ba_aht2
    #aht3 = T.dot(Wa_aht3_aht2, T.reshape(aht2,(n_s,n_proc))) + ba_aht3
    atm1_mu = T.dot(Wa_atmu_aht, T.reshape(aht, (n_s, n_proc))) + ba_atmu
    # softplus keeps the scale positive; sig_min_action is added on top.
    atm1_sig = T.nnet.softplus(
        T.dot(Wa_atsig_aht, T.reshape(aht, (n_s, n_proc))) +
        ba_atsig) + sig_min_action

    # Sample Action
    atm1 = atm1_mu + theano_rng.normal((n_oa, n_proc)) * atm1_sig

    # Update Environment
    # The action is squashed by tanh before it acts as a force.
    action_force = T.tanh(atm1)
    # Piecewise position-dependent force (different expressions for
    # negative and non-negative position) minus a velocity-proportional term.
    force = T.switch(
        T.lt(postm1, 0.0), -2 * postm1 - 1,
        -T.pow(1 + 5 * T.sqr(postm1), -0.5) - T.sqr(postm1) * T.pow(
            1 + 5 * T.sqr(postm1), -1.5) - T.pow(postm1,
                                                 4) / 16.0) - 0.25 * vtm1
    vt = vtm1 + 0.05 * force + 0.03 * action_force
    post = postm1 + vt

    # Generate Sensory Inputs:

    # 1.) Observation of Last Action
    oat = atm1

    # 2.) Noisy Observation of Current Position
    ot = post + theano_rng.normal((n_o, n_proc)) * 0.01

    # 3.) Nonlinear Transformed Sensory Channel
    # Gaussian bump centred at position 1.0 with width 0.3.
    oht = T.exp(-T.sqr(post - 1.0) / 2.0 / 0.3 / 0.3)

    # Infer hidden state from last hidden state and current observations, using variational density
    hst = T.nnet.relu(
        T.dot(Wq_hst_stm1, T.reshape(stm1, (n_s, n_proc))) + T.dot(
            Wq_hst_ot, T.reshape(ot, (n_o, n_proc))) + T.dot(
                Wq_hst_oht, T.reshape(oht, (n_oh, n_proc))) + T.dot(
                    Wq_hst_oat, T.reshape(oat, (n_oa, n_proc))) + bq_hst)
    hst2 = T.nnet.relu(
        T.dot(Wq_hst2_hst, T.reshape(hst, (n_s, n_proc))) + bq_hst2)

    stmu = T.tanh(
        T.dot(Wq_stmu_hst2, T.reshape(hst2, (n_s, n_proc))) + bq_stmu)
    stsig = T.nnet.softplus(
        T.dot(Wq_stsig_hst2, T.reshape(hst2, (n_s, n_proc))) +
        bq_stsig) + sig_min_states

    # Explicitly encode position as homeostatic state variable
    # Rescale representation to fit within linear response of the tanh-nonlinearity
    # Row 0 of the state is overwritten: mean 0.1 * observed position,
    # standard deviation fixed at 0.005.
    stmu = T.set_subtensor(stmu[0, :], 0.1 * ot[0, :]).reshape((n_s,
                                                                n_proc))
    stsig = T.set_subtensor(stsig[0, :], 0.005).reshape((n_s, n_proc))

    # Sample from variational density
    st = stmu + theano_rng.normal((n_s, n_proc)) * stsig

    # Calculate parameters of likelihood distributions from sampled state
    ost = T.nnet.relu(T.dot(Wl_ost_st, T.reshape(st, (n_s, n_proc))) +
                      bl_ost)
    ost2 = T.nnet.relu(
        T.dot(Wl_ost2_ost, T.reshape(ost, (n_s, n_proc))) + bl_ost2)
    ost3 = T.nnet.relu(
        T.dot(Wl_ost3_ost2, T.reshape(ost2, (n_s, n_proc))) + bl_ost3)

    otmu = T.dot(Wl_otmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_otmu
    otsig = T.nnet.softplus(
        T.dot(Wl_otsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_otsig) + sig_min_obs

    ohtmu = T.dot(Wl_ohtmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_ohtmu
    ohtsig = T.nnet.softplus(
        T.dot(Wl_ohtsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_ohtsig) + sig_min_obs

    oatmu = T.dot(Wl_oatmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_oatmu
    oatsig = T.nnet.softplus(
        T.dot(Wl_oatsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_oatsig) + sig_min_obs

    # Calculate negative log-likelihood of observations
    p_ot = GaussianNLL(ot, otmu, otsig)
    p_oht = GaussianNLL(oht, ohtmu, ohtsig)
    p_oat = GaussianNLL(oat, oatmu, oatsig)

    # Calculate prior expectation on hidden state from previous state
    prior_stmu = T.tanh(
        T.dot(Wl_stmu_stm1, T.reshape(stm1, (n_s, n_proc))) + bl_stmu)
    prior_stsig = T.nnet.softplus(
        T.dot(Wl_stsig_stm1, T.reshape(stm1, (n_s, n_proc))) +
        bl_stsig) + sig_min_states

    # Explicitly encode expectations on homeostatic state variable
    # For t < 20 the learned prior is used as is; from t = 20 on, row 0 of
    # the prior is pinned to mean 0.1 and standard deviation 0.005.
    prior_stmu = ifelse(T.lt(t, 20), prior_stmu,
                        T.set_subtensor(prior_stmu[0, :], 0.1))
    prior_stsig = ifelse(T.lt(t, 20), prior_stsig,
                         T.set_subtensor(prior_stsig[0, :], 0.005))

    # Calculate KL divergence between variational density and prior density
    # using explicit formula for diagonal gaussians
    KL_st = KLGaussianGaussian(stmu, stsig, prior_stmu, prior_stsig)

    # Put free energy functional together
    FEt = KL_st + p_ot + p_oht + p_oat

    return st, post, vt, oat, ot, oht, FEt, KL_st, stmu, stsig, force, p_ot, p_oht, p_oat
def __init__(self, numpy_rng, theano_rng=None,
             cfg = None,  # the network configuration
             dnn_shared = None, shared_layers=[], input = None):
    """Build a feed-forward network whose layer `self.rnn_layerX` is an RnnLayer.

    Hidden layers are created from `cfg.hidden_layers_sizes`; an
    `OutputLayer` is placed on top. `self.finetune_cost` is the output
    layer's `l2` cost plus optional L1/L2 weight penalties, and
    `self.errors` is the output layer's error expression.

    Parameters
    ----------
    numpy_rng : random generator handed to the layers.
    theano_rng : optional; when falsy a `RandomStreams` is seeded from
        `numpy_rng` (this consumes one draw from `numpy_rng`).
    cfg : network configuration object.
    dnn_shared : network whose `layers[i].W` / `layers[i].b` are reused for
        every index `i` listed in `shared_layers`.
    shared_layers : indices of layers that share parameters with
        `dnn_shared`. Only read, never modified.
    input : optional symbolic input; a fresh matrix 'x' is used if None.
    """
    self.layers = []
    self.params = []
    self.delta_params = []
    # Index of the hidden layer that is built as an RnnLayer.
    self.rnn_layerX = 2
    print("Use DRN 2")

    self.cfg = cfg
    self.n_ins = cfg.n_ins
    self.n_outs = cfg.n_outs
    self.hidden_layers_sizes = cfg.hidden_layers_sizes
    self.hidden_layers_number = len(self.hidden_layers_sizes)
    self.activation = cfg.activation

    self.do_maxout = cfg.do_maxout
    self.pool_size = cfg.pool_size

    self.max_col_norm = cfg.max_col_norm
    self.l1_reg = cfg.l1_reg
    self.l2_reg = cfg.l2_reg

    self.non_updated_layers = cfg.non_updated_layers

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    # allocate symbolic variables for the data
    # `is None` instead of `== None`: identity is what is meant, and `==`
    # would be evaluated element-wise if an array-like were passed.
    if input is None:
        self.x = T.matrix('x')
    else:
        self.x = input
    self.y = T.matrix('y')

    for i in xrange(self.hidden_layers_number):
        # construct the hidden layer
        if i == 0:
            input_size = self.n_ins
            layer_input = self.x
        else:
            input_size = self.hidden_layers_sizes[i - 1]
            layer_input = self.layers[-1].output
        W = None
        b = None
        if i in shared_layers:
            W = dnn_shared.layers[i].W
            b = dnn_shared.layers[i].b
        if i == self.rnn_layerX:
            hidden_layer = RnnLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=self.hidden_layers_sizes[i],
                                    W = W, b = b,
                                    activation=self.activation)
        else:
            if self.do_maxout == True:
                # Maxout: the layer is `pool_size` times wider and uses a
                # linear activation; pooling is done inside the layer.
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                           input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                           W = W, b = b,
                                           activation = (lambda x: 1.0*x),
                                           do_maxout = True, pool_size = self.pool_size)
            else:
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                           input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i],
                                           W = W, b = b,
                                           activation=self.activation)
        # add the layer to our list of layers
        self.layers.append(hidden_layer)
        # if the layer index is included in self.non_updated_layers,
        # parameters of this layer will not be updated
        if i not in self.non_updated_layers:
            self.params.extend(hidden_layer.params)
            self.delta_params.extend(hidden_layer.delta_params)

    # We now need to add an output layer on top of the MLP
    self.logLayer = OutputLayer(
        input=self.layers[-1].output,
        n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)

    if self.n_outs > 0:
        self.layers.append(self.logLayer)
        self.params.extend(self.logLayer.params)
        self.delta_params.extend(self.logLayer.delta_params)

    # compute the cost for second phase of training, taken from the
    # output layer's `l2` method
    self.finetune_cost = self.logLayer.l2(self.y)
    self.errors = self.logLayer.errors(self.y)

    if self.l1_reg is not None:
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l1_reg * (abs(W).sum())

    if self.l2_reg is not None:
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
def RNN():
    """Build, train and periodically evaluate a two-branch recurrent model.

    Two separate stacks of two ``lasagne.layers.RecurrentLayer`` each (one
    stack per sentence of a pair) are built; the last time step of every
    stack is taken, the cosine similarity of the two resulting vectors is
    computed, and the mean squared difference to ``target_values`` is the
    training cost, minimised with Adagrad.

    Data comes from ``gen_csvdata()``. Training runs for ``num_epochs``
    epochs on the full training arrays; every 100 epochs the similarity is
    computed for the undergoer / enabler / trigger / result test sets, stored
    as columns of ``test_df`` and written to a per-epoch directory as CSV.
    A ``KeyboardInterrupt`` stops training silently. Nothing is returned.

    Relies on module-level names not defined here: ``MAX_LENGTH``,
    ``N_FEATURES``, ``N_HIDDEN``, ``GRAD_CLIP``, ``LEARNING_RATE``,
    ``num_epochs``, ``gen_csvdata`` and ``averageFinalScore``.
    """
    # ---- Branch 1: network for the first sentence of each pair ----
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features).
    # The batch size is left as None; the sequence length is fixed to
    # MAX_LENGTH and the per-step feature size to N_FEATURES.
    # NOTE(review): an earlier comment here said each word is a 300-dimensional
    # vector while the output path below says "wordvec200" -- confirm N_FEATURES.
    W = lasagne.init.HeUniform()
    l_in_1 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH, N_FEATURES))
    # Mask input of shape (batch size, max sequence length), passed to the
    # recurrent layers as mask_input.
    l_mask_1 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH))
    l_forward_1 = lasagne.layers.RecurrentLayer(
        l_in_1, N_HIDDEN, mask_input=l_mask_1, grad_clipping=GRAD_CLIP,
        W_in_to_hid=W,
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    # Second recurrent layer stacked on the first; it is given the same
    # initializer object W for its input-to-hidden weights.
    l_forward_2_1 = lasagne.layers.RecurrentLayer(
        l_forward_1, N_HIDDEN, mask_input=l_mask_1, grad_clipping=GRAD_CLIP,
        W_in_to_hid=W,
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    # l_forward_3_1 = lasagne.layers.RecurrentLayer(
    #     l_forward_2_1, N_HIDDEN, mask_input=l_mask_1, grad_clipping=GRAD_CLIP,
    #     W_in_to_hid=W,
    #     W_hid_to_hid=lasagne.init.HeUniform(),
    #     nonlinearity=lasagne.nonlinearities.tanh)
    # Keep only index -1 along axis 1 (the sequence axis), i.e. the last
    # time step of the top recurrent layer.
    l_out_1 = lasagne.layers.SliceLayer(l_forward_2_1, -1, 1)
    #l_out_1 = lasagne.layers.DenseLayer(l_forward_1, num_units=n_output)

    # ---- Branch 2: network for the second sentence of each pair ----
    # Same structure as branch 1, with its own layers and initializers.
    l_in_2 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH, N_FEATURES))
    l_mask_2 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH))
    l_forward_2 = lasagne.layers.RecurrentLayer(
        l_in_2, N_HIDDEN, mask_input=l_mask_2, grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_2_2 = lasagne.layers.RecurrentLayer(
        l_forward_2, N_HIDDEN, mask_input=l_mask_2, grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    # l_forward_3_2 = lasagne.layers.RecurrentLayer(
    #     l_forward_2_2, N_HIDDEN, mask_input=l_mask_2, grad_clipping=GRAD_CLIP,
    #     W_in_to_hid=lasagne.init.HeUniform(),
    #     W_hid_to_hid=lasagne.init.HeUniform(),
    #     nonlinearity=lasagne.nonlinearities.tanh)
    l_out_2 = lasagne.layers.SliceLayer(l_forward_2_2, -1, 1)
    #l_out_2 = lasagne.layers.DenseLayer(l_forward_2, num_units=n_output)

    # Target cosine similarity of each sentence pair.
    target_values = T.vector('target_output')

    network_output_1 = lasagne.layers.get_output(l_out_1)
    #network_output_1 = lasagne.layers.get_output(l_out_1)
    network_output_2 = lasagne.layers.get_output(l_out_2)
    #network_output_2 = lasagne.layers.get_output(l_out_2)

    # Euclidean norm of each branch output, taken along axis 1.
    mod_y_1 = T.sqrt(T.sum(T.sqr(network_output_1), 1))
    mod_y_2 = T.sqrt(T.sum(T.sqr(network_output_2), 1))
    # Cosine similarity = dot product / product of norms.
    # NOTE(review): no epsilon in the denominator -- an all-zero output row
    # would divide by zero.
    cosine_simi = T.sum(network_output_1 * network_output_2, axis=1) / (mod_y_1 * mod_y_2)
    # Mean squared error between predicted and target similarity.
    cost = T.mean((cosine_simi - target_values)**2)
    # cosine_sim = T.sum(network_output_1*network_output_2,axis = 1)
    # Retrieve all parameters of both branches.
    all_params = lasagne.layers.get_all_params(
        l_out_1) + lasagne.layers.get_all_params(l_out_2)
    # Compute Adagrad updates for training.
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
    # Theano functions for training and for computing the similarity.
    print("Compiling functions ...")
    # train: returns the cost and applies the Adagrad updates.
    train = theano.function([
        l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
        l_mask_2.input_var
    ], cost, updates=updates, on_unused_input='warn')
    # compute_cost = theano.function(
    #     [l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
    #      l_mask_2.input_var], cost, on_unused_input='warn')
    #
    # test_cosine: returns the predicted similarities without updating
    # anything. target_values is accepted for a uniform call signature but
    # does not feed cosine_simi (hence on_unused_input='warn').
    test_cosine = theano.function([
        l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
        l_mask_2.input_var
    ], cosine_simi, on_unused_input='warn')

    # gen_csvdata() yields, in this order: the training set, then the
    # undergoer, trigger, enabler and result test sets (each as sentence 1,
    # sentence 2, target similarity, mask 1, mask 2), and finally test_df.
    train_sentence_1, train_sentence_2, cosineSimtrain, mask_train_1, mask_train_2 \
    ,test_sentence_undergoer_1, test_sentence_undergoer_2 \
    ,cosineSimUndergoer, mask_undergoer_test_1, mask_undergoer_test_2 \
    ,test_sentence_trigger_1, test_sentence_trigger_2 \
    ,cosineSimTrigger, mask_trigger_test_1, mask_trigger_test_2\
    ,test_sentence_enabler_1, test_sentence_enabler_2, \
    cosineSimEnabler, mask_enabler_test_1, mask_enabler_test_2\
    ,test_sentence_result_1, test_sentence_result_2, \
    cosineSimResult, mask_result_test_1, mask_result_test_2,\
    test_df = gen_csvdata()

    print("Training ...")
    try:
        for epoch in range(num_epochs):
            # One update step on the whole training set per epoch.
            cost_val = train(train_sentence_1, train_sentence_2, cosineSimtrain, mask_train_1, mask_train_2)
            #cost_val = compute_cost(train_sentence_1, train_sentence_2, cosineSimtrain, mask_train_1,mask_train_2 )
            # NOTE: despite the label, cost_val is the cost returned by
            # train() on the training arrays.
            print("Epoch {} validation cost = {}".format(epoch, cost_val))
            # Every 100 epochs (including epoch 0) score the four test sets
            # and dump the predictions.
            if epoch % 100 == 0:
                cosine_undergoersim = test_cosine(test_sentence_undergoer_1, test_sentence_undergoer_2,\
                                                  cosineSimUndergoer, mask_undergoer_test_1, mask_undergoer_test_2)
                cosine_enablersim = test_cosine(test_sentence_enabler_1, test_sentence_enabler_2,\
                                                cosineSimEnabler, mask_enabler_test_1, mask_enabler_test_2)
                cosine_triggersim = test_cosine(test_sentence_trigger_1, test_sentence_trigger_2,\
                                                cosineSimTrigger, mask_trigger_test_1, mask_trigger_test_2)
                cosine_resultsim = test_cosine(test_sentence_result_1, test_sentence_result_2,\
                                               cosineSimResult, mask_result_test_1, mask_result_test_2)
                test_df["newUndergoerScore"] = cosine_undergoersim
                test_df["newEnablerScore"] = cosine_enablersim
                test_df["newTriggerScore"] = cosine_triggersim
                test_df["newResultScore"] = cosine_resultsim
                # Row-wise combination of the scores via averageFinalScore.
                test_df["avgOurScore"] = test_df.apply(averageFinalScore, axis=1)
                # One output directory per evaluated epoch.
                directory = "newresult/prediction/deep2RNNphys_wordvec200/" + str(
                    epoch)
                if not os.path.exists(directory):
                    os.makedirs(directory)
                test_df.to_csv(directory + "/cosineSimilarity.csv")
    except KeyboardInterrupt:
        # Allow the user to stop training with Ctrl-C without a traceback.
        pass
def squared_errors(self, y):
    """Mean squared difference between ``self.y_pred`` and ``y``.

    The mean is taken over all elements at once (no ``axis`` argument), so a
    single symbolic value is returned rather than one value per row or column.
    """
    residual = self.y_pred - y
    squared_residual = T.sqr(residual)
    return T.mean(squared_residual)
def init_updates(self):
    """Build the update rules and compile the Theano training function.

    Steps, in order:

    1. Differentiate ``self.cost`` with respect to ``self.params``.
    2. If ``self.nan_protection`` is set, replace NaN and infinite gradient
       entries by zero.
    3. If ``self.grad_norm_clip`` is not None, rescale all gradients so that
       their joint L2 norm does not exceed it.
    4. If ``self.grad_clip`` is not None, clamp every gradient entry to
       ``[-grad_clip, grad_clip]``.
    5. Build parameter updates that blend the current gradient with the
       previous one stored in ``self.gparams`` (weighted by the momentum),
       and updates that store the current gradient for the next step.
    6. Let the notifier's ``GRADIENT_CALCULATED`` callbacks replace the
       update list, if any are registered and return something.

    Per-parameter learning rate, multiplier and momentum can be overridden by
    the notifier, keyed by ``param.name``; otherwise ``self.lr``, ``1`` and
    ``self.mom`` are used.

    Returns
    -------
    theano function
        Takes ``self.variables`` values as inputs, applies the updates and
        outputs ``self.validate`` if set, else ``self.cost``.
    """
    gparams = T.grad(self.cost, self.params, disconnected_inputs='warn')

    # Remove NaNs and infinities. The element-wise symbolic ``T.or_`` is
    # required here: Python's ``or`` does not build a graph node, it
    # short-circuits on the truth value of its first operand, so the isinf
    # test would never become part of the expression.
    if self.nan_protection:
        gparams = [
            T.switch(T.or_(T.isnan(g), T.isinf(g)), 0., g)
            for g in gparams
        ]

    # Gradient clipping by global norm: L2 norm over all gradients, i.e. the
    # square root (taken once) of the summed squares. The scale is 1 while
    # the norm is below the threshold and threshold / norm above it.
    if self.grad_norm_clip is not None:
        grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
        scale = self.grad_norm_clip / T.maximum(self.grad_norm_clip, grad_norm)
        gparams = [g * scale for g in gparams]

    # Gradient clipping (hard): clamp element-wise from above, then below.
    if self.grad_clip is not None:
        gparams = [T.minimum(g, self.grad_clip) for g in gparams]
        gparams = [T.maximum(g, -1. * self.grad_clip) for g in gparams]

    # Per-parameter hyper-parameters with global fallbacks. The notifier
    # lookups are deliberately best-effort: the notifier may be None (see
    # the explicit check further down) or may not supply overrides, in which
    # case the defaults simply stay in place.
    lr = defaultdict(lambda *args: self.lr)
    try:
        lr.update(dict(self.notifier.notify(Notifier.LEARNING_RATES)))
    except Exception:
        pass

    mult = defaultdict(lambda *args: np.cast[fx](1))
    try:
        mult.update(dict(self.notifier.notify(Notifier.PARAM_MULT)))
    except Exception:
        pass

    mom = defaultdict(lambda *args: self.mom)
    try:
        mom.update(dict(self.notifier.notify(Notifier.MOMENTUM)))
    except Exception:
        pass

    # Parameter updates: scaled parameter minus the learning rate times a
    # blend of the current and the previously stored gradient.
    updates_param = [
        (param, param * mult[param.name] -
         lr[param.name] * ((1 - mom[param.name]) * gparam_cur +
                           mom[param.name] * gparam_last))
        for param, gparam_cur, gparam_last in zip(self.params, gparams, self.gparams)
    ]
    # Store the current gradients so the next step can use them as
    # "last" gradients.
    updates_gparam = [
        (gparam_last, gparam_cur)
        for gparam_last, gparam_cur in zip(self.gparams, gparams)
    ]
    updates = updates_param + updates_gparam

    # Callback to an external function. E.g. there are non-integrable nodes
    # which should be added after the gradient calculation.
    if self.notifier is not None:
        if len(self.notifier.callbacks[Notifier.GRADIENT_CALCULATED]) > 0:
            grads_new = self.notifier.notify(Notifier.GRADIENT_CALCULATED, updates)
            if grads_new is not None and len(grads_new) > 0:
                # NOTE(review): stacking the callback results is kept as-is;
                # the loop below expects rows of (variable, expression)
                # pairs -- confirm what the callbacks return.
                updates = np.vstack(grads_new)

    # Ensure that the broadcast pattern before and after the update is
    # identical.
    updates = [(k, T.patternbroadcast(v, k.broadcastable)) for k, v in updates]

    validate = self.validate if self.validate is not None else self.cost
    # list(...) so that a real list (not a dict view on Python 3) is passed.
    return theano.function(inputs=list(self.variables.values()),
                           outputs=validate,
                           updates=updates,
                           allow_input_downcast=True,
                           on_unused_input='warn')
def fn(images):
    """Sum of the squared entries of all 2x2 neighbourhoods of ``images``.

    ``images2neibs`` extracts (2, 2) neighbourhoods with
    ``mode='ignore_borders'``; the result is squared element-wise and summed
    over axes 0 and 1.
    """
    neighbourhoods = images2neibs(images, (2, 2), mode='ignore_borders')
    squared = T.sqr(neighbourhoods)
    return T.sum(squared, axis=[0, 1])
def reg_mse(self, y, l1_weight=0.01, l2_weight=0.01):
    """Mean squared error plus L1 and squared-L2 penalties on the residual.

    With ``r = self.y_pred - y`` the returned value is::

        mean(r ** 2) + l1_weight * sum(|r|) + l2_weight * sum(r ** 2)

    Note that both penalty terms act on the prediction error ``r`` (summed
    over all elements), not on any model weights.

    Only operators and methods (``-``, ``**``, ``abs``, ``.sum()``,
    ``.mean()``) are used, so the function works on any array type that
    provides them.

    Parameters
    ----------
    y : tensor_like
        Target values, compatible in shape with ``self.y_pred``.
    l1_weight : float, optional
        Coefficient of the summed absolute error. Defaults to 0.01, the
        value that was previously hard-coded.
    l2_weight : float, optional
        Coefficient of the summed squared error. Defaults to 0.01, the
        value that was previously hard-coded.

    Returns
    -------
    Scalar cost combining the three terms above.
    """
    # Compute the residual and its square once and reuse them in every term.
    residual = self.y_pred - y
    squared = residual ** 2
    abs_error_sum = abs(residual).sum()
    sq_error_sum = squared.sum()
    return squared.mean() + l1_weight * abs_error_sum + l2_weight * sq_error_sum