def momentum_normscaled(loss, all_params, lr, mom, batch_size, max_norm=np.inf,
                        weight_decay=0.0, verbose=False):
    updates = []
    #all_grads = [theano.grad(loss, param) for param in all_params]
    all_grads = theano.grad(gradient_clipper(loss), all_params)

    grad_lst = [T.sum((grad / float(batch_size)) ** 2) for grad in all_grads]
    grad_norm = T.sqrt(T.sum(grad_lst))
    if verbose:
        grad_norm = theano.printing.Print('MOMENTUM GRAD NORM1:')(grad_norm)

    all_grads = ifelse(T.gt(grad_norm, max_norm),
                       [grads * (max_norm / grad_norm) for grads in all_grads],
                       all_grads)

    if verbose:
        grad_lst = [T.sum((grad / float(batch_size)) ** 2) for grad in all_grads]
        grad_norm = T.sqrt(T.sum(grad_lst))
        grad_norm = theano.printing.Print('MOMENTUM GRAD NORM2:')(grad_norm)
        all_grads = ifelse(T.gt(grad_norm, np.inf),
                           [grads * (max_norm / grad_norm) for grads in all_grads],
                           all_grads)

    for param_i, grad_i in zip(all_params, all_grads):
        mparam_i = theano.shared(np.zeros(param_i.get_value().shape,
                                          dtype=theano.config.floatX))
        v = mom * mparam_i - lr * (weight_decay * param_i + grad_i)
        updates.append((mparam_i, v))
        updates.append((param_i, param_i + v))
    return updates
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp, kshp, featshp,
                                     stride=(1, 1), small_value=.001):
    featshp = (imshp[0], kshp[0], featshp[2], featshp[3])  # num images, features, szy, szx
    features = T.reshape(T.transpose(a), featshp, ndim=4)
    amp = T.sqrt(features[:, ::2, :, :] ** 2 + features[:, 1::2, :, :] ** 2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[1:, :, :, :] - amp[:-1, :, :, :]
    d2 = d1[1:, :, :, :] - d1[:-1, :, :, :]
    div = T.set_subtensor(div[1:-1, :, :, :], -d2)
    div = T.set_subtensor(div[0, :, :, :], -d1[0, :, :, :])
    div = T.set_subtensor(div[-1, :, :, :], d1[-1, :, :, :])
    slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0)
    slow_shrinkage_prox_a = slow_amp_value * features[:, ::2, :, :]
    slow_shrinkage_prox_b = slow_amp_value * features[:, 1::2, :, :]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a ** 2 + slow_shrinkage_prox_b ** 2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.)
    subspacel1_prox = T.zeros_like(features)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:, ::2, :, :],
                                      amp_value * slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[:, 1::2, :, :],
                                      amp_value * slow_shrinkage_prox_b)

    reshape_subspacel1_prox = T.transpose(
        T.reshape(subspacel1_prox,
                  (featshp[0], featshp[1] * featshp[2] * featshp[3]),
                  ndim=2))
    return reshape_subspacel1_prox
def __init__(self, alpha, m, *args, **kwargs):
    super(Pareto, self).__init__(*args, **kwargs)
    self.alpha = alpha
    self.m = m
    self.mean = tt.switch(tt.gt(alpha, 1), alpha * m / (alpha - 1.0), np.inf)
    self.median = m * 2.0 ** (1.0 / alpha)
    self.variance = tt.switch(
        tt.gt(alpha, 2),
        (alpha * m ** 2) / ((alpha - 2.0) * (alpha - 1.0) ** 2),
        np.inf)
def irprop_minus_updates(params, grads):
    # iRPROP- parameters
    updates = []
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50.
    minStep = math.exp(-6)

    for param, gparam in zip(params, grads):
        # per-parameter step size and previous gradient, kept as shared state
        # (symbolic expressions cannot be branched on with a Python `if`,
        # so the three cases are expressed with T.switch)
        delta = theano.shared(0.1 * numpy.ones(param.get_value().shape,
                                               dtype=theano.config.floatX))
        last_gparam = theano.shared(numpy.zeros(param.get_value().shape,
                                                dtype=theano.config.floatX))

        # sign of the change: > 0 same direction, < 0 sign flip
        change = T.sgn(gparam * last_gparam)
        new_delta = T.clip(
            T.switch(T.gt(change, 0), delta * positiveStep,
                     T.switch(T.lt(change, 0), delta * negativeStep, delta)),
            minStep, maxStep)
        # on a sign flip, iRPROP- zeroes the stored gradient so the next
        # step is treated as neutral
        new_last_gparam = T.switch(T.lt(change, 0), T.zeros_like(gparam), gparam)

        # update the weights
        updates.append((param, param - T.sgn(gparam) * new_delta))
        updates.append((delta, new_delta))
        # store old gradient
        updates.append((last_gparam, new_last_gparam))
    return updates
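# A minimal usage sketch for the updates above: minimise a toy quadratic.
# Only plain Theano is used; irprop_minus_updates is assumed to be in scope
# as defined here.
import numpy
import theano
import theano.tensor as T

w = theano.shared(numpy.array([3.0, -2.0], dtype=theano.config.floatX), name='w')
loss = T.sum(w ** 2)  # toy objective with minimum at w = [0, 0]
train = theano.function([], loss,
                        updates=irprop_minus_updates([w], T.grad(loss, [w])))
for _ in range(20):
    train()
print(w.get_value())  # moves toward [0, 0]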
def decay(self):
    updates = []
    new_batch = ifelse(T.gt(self.batch, self.decay_batch), sharedX(0), self.batch + 1)
    new_lr = ifelse(T.gt(self.batch, self.decay_batch),
                    self.lr * self.lr_decay_factor, self.lr)
    updates.append((self.batch, new_batch))
    updates.append((self.lr, new_lr))
    return updates
def T_subspacel1_slow_shrinkage(a, L, lam_sparse, lam_slow, small_value=.001):
    amp = T.sqrt(a[::2, :] ** 2 + a[1::2, :] ** 2 + small_value)
    #damp = amp[:,1:] - amp[:,:-1]

    # compose slow shrinkage with subspace l1 shrinkage

    # slow shrinkage
    div = T.zeros_like(amp)
    d1 = amp[:, 1:] - amp[:, :-1]
    d2 = d1[:, 1:] - d1[:, :-1]
    div = T.set_subtensor(div[:, 1:-1], -d2)
    div = T.set_subtensor(div[:, 0], -d1[:, 0])
    div = T.set_subtensor(div[:, -1], d1[:, -1])
    slow_amp_shrinkage = 1 - (lam_slow / L) * (div / amp)
    slow_amp_value = T.switch(T.gt(slow_amp_shrinkage, 0), slow_amp_shrinkage, 0)
    slow_shrinkage_prox_a = slow_amp_value * a[::2, :]
    slow_shrinkage_prox_b = slow_amp_value * a[1::2, :]

    # subspace l1 shrinkage
    amp_slow_shrinkage_prox = T.sqrt(slow_shrinkage_prox_a ** 2 + slow_shrinkage_prox_b ** 2)
    #amp_shrinkage = 1. - (lam_slow*lam_sparse/L)*amp_slow_shrinkage_prox
    amp_shrinkage = 1. - (lam_sparse / L) / amp_slow_shrinkage_prox
    amp_value = T.switch(T.gt(amp_shrinkage, 0.), amp_shrinkage, 0.)
    subspacel1_prox = T.zeros_like(a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[::2, :],
                                      amp_value * slow_shrinkage_prox_a)
    subspacel1_prox = T.set_subtensor(subspacel1_prox[1::2, :],
                                      amp_value * slow_shrinkage_prox_b)
    return subspacel1_prox
def __init__(self, x, lower, upper, *args, **kwargs):
    super(Uniform, self).__init__(*args, **kwargs)
    self._logp = T.log(T.switch(T.gt(x, upper), 0,
                                T.switch(T.lt(x, lower), 0, 1 / (upper - lower))))
    self._cdf = T.switch(T.gt(x, upper), 1,
                         T.switch(T.lt(x, lower), 0, (x - lower) / (upper - lower)))
    self._add_expr('x', x)
    self._add_expr('lower', lower)
    self._add_expr('upper', upper)
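# A standalone check of the piecewise CDF pattern above (hedged sketch,
# plain Theano only; lower/upper are fixed toy bounds here).
import theano
import theano.tensor as T

x = T.dscalar('x')
lower, upper = 0.0, 4.0
cdf = T.switch(T.gt(x, upper), 1,
               T.switch(T.lt(x, lower), 0, (x - lower) / (upper - lower)))
f = theano.function([x], cdf)
print(f(-1.0), f(2.0), f(9.0))  # 0.0 0.5 1.0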
def errors(self, y, print_output=False):
    # check if y has same dimension as y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        num_positive = T.cast(T.sum(T.eq(y, 1)), 'float64')
        num_predicted_positive = T.cast(T.sum(T.eq(self.y_pred, 1)), 'float64')
        num_correctly_predicted = T.cast(T.sum(T.eq(self.y_pred * y, 1)), 'float64')

        # the conditions must stay symbolic: a Python `if` on a Theano
        # expression would always take the same branch at graph-build time
        # precision = True positive / (True positive + False positive)
        P = T.switch(T.gt(num_predicted_positive, 0.0),
                     num_correctly_predicted / num_predicted_positive,
                     T.cast(0.0, 'float64'))
        # recall = True positive / (True positive + False negative)
        R = T.switch(T.gt(num_positive, 0.0),
                     num_correctly_predicted / num_positive,
                     T.cast(0.0, 'float64'))
        # F1 score
        F1 = T.switch(T.gt(P + R, 0.0),
                      2.0 * P * R / (P + R),
                      T.cast(0.0, 'float64'))

        if print_output:
            # note: these print symbolic expressions, not evaluated values
            print("  num positive = {0}".format(num_positive))
            print("  num predicted positive = {0}".format(num_predicted_positive))
            print("  num correctly predicted = {0}".format(num_correctly_predicted))
            print("  precision = {0}".format(P))
            print("  recall = {0}".format(R))
            print("  F1 score = {0}".format(F1))

        return [T.mean(T.neq(self.y_pred, y)), P, R, F1]
    else:
        raise NotImplementedError()
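# The same symbolic precision/recall pattern, checked outside the class
# (a sketch with plain vectors; the names here are illustrative only).
import numpy as np
import theano
import theano.tensor as T

y = T.ivector('y')
y_pred = T.ivector('y_pred')
num_pos = T.cast(T.sum(T.eq(y, 1)), 'float64')
num_pred_pos = T.cast(T.sum(T.eq(y_pred, 1)), 'float64')
num_correct = T.cast(T.sum(T.eq(y_pred * y, 1)), 'float64')
P = T.switch(T.gt(num_pred_pos, 0.0), num_correct / num_pred_pos, 0.0)
R = T.switch(T.gt(num_pos, 0.0), num_correct / num_pos, 0.0)
f = theano.function([y, y_pred], [P, R])
print(f(np.array([1, 1, 0, 0], dtype='int32'),
        np.array([1, 0, 1, 0], dtype='int32')))  # [0.5, 0.5]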
def call(self, X):
    if type(X) is not list or len(X) != 2:
        raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))

    frame, position = X[0], X[1]

    # Reshaping the input to exclude the time dimension
    frameShape = K.shape(frame)
    positionShape = K.shape(position)
    (chans, height, width) = frameShape[-3:]
    targetDim = positionShape[-1]
    frame = K.reshape(frame, (-1, chans, height, width))
    position = K.reshape(position, (-1, ) + (targetDim, ))

    # Applying the attention
    hw = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
    hh = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0
    position = THT.maximum(THT.set_subtensor(position[:, 0], position[:, 0] - hw), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 2], position[:, 2] + hw), 1.0)
    position = THT.maximum(THT.set_subtensor(position[:, 1], position[:, 1] - hh), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 3], position[:, 3] + hh), 1.0)
    rX = Data.linspace(-1.0, 1.0, width)
    rY = Data.linspace(-1.0, 1.0, height)
    FX = THT.gt(rX, position[:, 0].dimshuffle(0, 'x')) * \
        THT.le(rX, position[:, 2].dimshuffle(0, 'x'))
    FY = THT.gt(rY, position[:, 1].dimshuffle(0, 'x')) * \
        THT.le(rY, position[:, 3].dimshuffle(0, 'x'))
    m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
    m = m + self.alpha - THT.gt(m, 0.) * self.alpha
    frame = frame * m.dimshuffle(0, 'x', 1, 2)

    # Reshaping the frame to include the time dimension
    output = K.reshape(frame, frameShape)

    return output
def multiclassRealPosAndNegAndTruePredPosNegTraining0OrValidation1(self, y, training0OrValidation1):
    """
    The returned list has (numberOfClasses)x4 integers:
    >numberOfRealPositives, numberOfRealNegatives, numberOfTruePredictedPositives,
    numberOfTruePredictedNegatives< for each class (incl background).
    For class_i == 0 (backgr), what is reported is the WHOLE rp,rn,tpp,tpn,
    ie as calculated considering background VS all other classes.
    Order in the list is the natural order of the classes
    (ie class-0-WHOLE RP,RN,TPP,TPN, class-1 RP,RN,TPP,TPN, class-2 RP,RN,TPP,TPN ...)
    """
    returnedListWithNumberOfRpRnPpPnForEachClass = []

    for class_i in xrange(0, self.numberOfOutputClasses):
        # Number of Real Positives, Real Negatives, True Predicted Positives and
        # True Predicted Negatives are reported PER CLASS (first for WHOLE).
        vectorOneAtRealPositives = T.gt(y, 0) if class_i == 0 else T.eq(y, class_i)
        vectorOneAtRealNegatives = T.eq(y, 0) if class_i == 0 else T.neq(y, class_i)

        if training0OrValidation1 == 0:  # training
            yPredToUse = self.y_pred
        else:  # validation
            yPredToUse = self.y_pred_inference

        vectorOneAtPredictedPositives = T.gt(yPredToUse, 0) if class_i == 0 else T.eq(yPredToUse, class_i)
        vectorOneAtPredictedNegatives = T.eq(yPredToUse, 0) if class_i == 0 else T.neq(yPredToUse, class_i)
        vectorOneAtTruePredictedPositives = T.and_(vectorOneAtRealPositives, vectorOneAtPredictedPositives)
        vectorOneAtTruePredictedNegatives = T.and_(vectorOneAtRealNegatives, vectorOneAtPredictedNegatives)

        returnedListWithNumberOfRpRnPpPnForEachClass.append(T.sum(vectorOneAtRealPositives))
        returnedListWithNumberOfRpRnPpPnForEachClass.append(T.sum(vectorOneAtRealNegatives))
        returnedListWithNumberOfRpRnPpPnForEachClass.append(T.sum(vectorOneAtTruePredictedPositives))
        returnedListWithNumberOfRpRnPpPnForEachClass.append(T.sum(vectorOneAtTruePredictedNegatives))

    return returnedListWithNumberOfRpRnPpPnForEachClass
def norm_col(w, h):
    """Normalize the column vector w (Theano function).
    Apply the inverse normalization on h such that w.h does not change.

    Parameters
    ----------
    w: Theano vector
        vector to be normalised
    h: Theano vector
        vector to be scaled by the inverse normalisation

    Returns
    -------
    w : Theano vector with the same shape as w
        normalised vector (w/norm)
    h : Theano vector with the same shape as h
        h*norm
    """
    norm = w.norm(2, 0)
    eps = 1e-12
    size_norm = (T.ones_like(w)).norm(2, 0)
    w = ifelse(T.gt(norm, eps),
               w / norm,
               (w + eps) / (eps * size_norm).astype(theano.config.floatX))
    h = ifelse(T.gt(norm, eps),
               h * norm,
               (h * eps * size_norm).astype(theano.config.floatX))
    return w, h
def calcColNormalizer(inMatrix):
    # Theano function for calculating logSum, i.e., ln(X + Y) from ln(X) and ln(Y).
    maxExp = -4950.0
    x, y = T.fscalars(2)
    yMinusx = y - x  # this part is for the condition where x > y
    xMinusy = x - y  # if x < y
    bigger = T.switch(T.gt(x, y), x, y)
    YSubtractX = T.switch(T.gt(x, y), yMinusx, xMinusy)
    x_prime = T.log(1 + T.exp(YSubtractX)) + bigger
    calcSum = T.switch(T.lt(YSubtractX, maxExp), bigger, x_prime)
    logSum = function([x, y], calcSum, allow_input_downcast=True)
    ####### end of logSum ###############

    # now we calculate the sum of log joints as the normalizer
    if len(inMatrix.shape) < 2:
        raise Exception("calcColNormalizer expects a 2D matrix")
    nRows, nCols = inMatrix.shape
    columnAccumLogSum = np.zeros(nCols)
    for col in range(nCols):
        currLogSum = np.NINF
        for j in range(nRows):
            if inMatrix[j, col] == np.NINF:
                continue
            currLogSum = logSum(currLogSum, inMatrix[j, col])
        columnAccumLogSum[col] = currLogSum
    return columnAccumLogSum
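# Quick numeric check (sketch; assumes calcColNormalizer and its module-level
# Theano imports are in scope): each column holds log(e^0 + e^0) = log(2).
import numpy as np

cols = calcColNormalizer(np.zeros((2, 3), dtype='float32'))
print(cols)  # ~[0.693 0.693 0.693]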
def _backward_negative_z(inputs, weights, normed_relevances, bias=None):
    inputs_plus = inputs * T.gt(inputs, 0)
    weights_plus = weights * T.gt(weights, 0)
    inputs_minus = inputs * T.lt(inputs, 0)
    weights_minus = weights * T.lt(weights, 0)
    # Compute weights+ * inputs- and weights- * inputs+
    negative_part_a = conv2d(
        normed_relevances,
        weights_plus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1],
        border_mode="full")
    negative_part_a *= inputs_minus
    negative_part_b = conv2d(
        normed_relevances,
        weights_minus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1],
        border_mode="full")
    negative_part_b *= inputs_plus

    together = negative_part_a + negative_part_b

    if bias is not None:
        bias_negative = bias * T.lt(bias, 0)
        bias_relevance = bias_negative.dimshuffle("x", 0, "x", "x") * normed_relevances
        # Divide bias by weight size before convolving back
        # mean across channel, 0, 1 dims (hope this is correct?)
        fraction_bias = bias_relevance / T.prod(weights.shape[1:]).astype(theano.config.floatX)
        bias_rel_in = conv2d(
            fraction_bias,
            T.ones_like(weights).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1],
            border_mode="full")
        together += bias_rel_in
    return together
def group_div(X, W, H, beta, params):
    """Compute beta divergence D(X|WH), intra-class distance
    and intra-session distance for a particular (class, session) couple [1]_.

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar
    params : Theano tensor
        Matrix of parameters related to class/session.
        :params[0][0]: index for the (class, session) couple
        :params[1][0]: number of basis vectors related to class
        :params[1][1]: number of basis vectors related to session
        :params[2]: weight on the class/session similarity constraints
        :params[3]: sessions in which class c appears
        :params[4]: classes present in session s

    Returns
    -------
    cost : Theano scalar
        total cost
    div : Theano scalar
        beta divergence D(X|WH)
    sum_cls : Theano scalar
        intra-class distance
    sum_ses : Theano scalar
        intra-session distance"""
    ind = params[0][0]
    k_cls = params[1][0]
    k_ses = params[1][1]
    lambdas = params[2]
    Sc = params[3]
    Cs = params[4]
    res_ses, up = theano.scan(
        fn=lambda Cs, prior_result: prior_result + eucl_dist(
            W[ind, :, k_cls:k_cls + k_ses], W[Cs, :, k_cls:k_cls + k_ses]),
        outputs_info=T.zeros_like(beta),
        sequences=Cs)
    sum_ses = ifelse(T.gt(Cs[0], 0), res_ses[-1], T.zeros_like(beta))
    res_cls, up = theano.scan(
        fn=lambda Sc, prior_result: prior_result + eucl_dist(
            W[ind, :, 0:k_cls], W[Sc, :, 0:k_cls]),
        outputs_info=T.zeros_like(beta),
        sequences=Sc)
    sum_cls = ifelse(T.gt(Sc[0], 0), res_cls[-1], T.zeros_like(beta))
    betaDiv = beta_div(X, W[ind].T, H, beta)
    cost = lambdas[0] * sum_cls + lambdas[1] * sum_ses + betaDiv
    return cost, betaDiv, sum_cls, sum_ses
def symGivens2(a, b):
    """
    Stable Symmetric Givens rotation plus reflection.

    Parameters
        a: (theano scalar) first element of a two-vector  [a; b]
        b: (theano scalar) second element of a two-vector [a; b]

    Returns
        c  cosine(theta), where theta is the implicit angle of rotation
           (counter-clockwise) in a plane-rotation
        s  sine(theta)
        d  two-norm of [a; b]

    Description:
        This method gives c and s such that
            [ c  s ][a] = [d],
            [ s -c ][b]   [0]
        where d = two-norm of vector [a, b],
              c = a / sqrt(a^2 + b^2) = a / d,
              s = b / sqrt(a^2 + b^2) = b / d.
        The implementation guards against overflow in computing
        sqrt(a^2 + b^2).

    SEE ALSO:
        (1) Algorithm 4.9, stable *unsymmetric* Givens rotations in Golub
            and van Loan's book Matrix Computations, 3rd edition.
        (2) MATLAB's function PLANEROT.

    Observations:
        Implementing this function as a single op in C might improve speed
        considerably.
    """
    c_branch1 = T.switch(T.eq(a, constantX(0)), constantX(1), T.sgn(a))
    c_branch21 = (a / b) * T.sgn(b) / T.sqrt(constantX(1) + (a / b) ** 2)
    c_branch22 = T.sgn(a) / T.sqrt(constantX(1) + (b / a) ** 2)

    c_branch2 = T.switch(T.eq(a, constantX(0)),
                         constantX(0),
                         T.switch(T.gt(abs(b), abs(a)), c_branch21, c_branch22))
    c = T.switch(T.eq(b, constantX(0)), c_branch1, c_branch2)

    s_branch1 = T.sgn(b) / T.sqrt(constantX(1) + (a / b) ** 2)
    s_branch2 = (b / a) * T.sgn(a) / T.sqrt(constantX(1) + (b / a) ** 2)
    s = T.switch(
        T.eq(b, constantX(0)),
        constantX(0),
        T.switch(T.eq(a, constantX(0)), T.sgn(b),
                 T.switch(T.gt(abs(b), abs(a)), s_branch1, s_branch2)))

    d_branch1 = b / (T.sgn(b) / T.sqrt(constantX(1) + (a / b) ** 2))
    d_branch2 = a / (T.sgn(a) / T.sqrt(constantX(1) + (b / a) ** 2))
    d = T.switch(
        T.eq(b, constantX(0)),
        abs(a),
        T.switch(T.eq(a, constantX(0)), abs(b),
                 T.switch(T.gt(abs(b), abs(a)), d_branch1, d_branch2)))
    return c, s, d
def __call__(self, input):
    mean = input.mean(self.axes, keepdims=True)
    std = input.std(self.axes, keepdims=True) + self.epsilon
    # Don't batch-normalise a single data point
    mean = ifelse(T.gt(input.shape[0], 1), mean, T.zeros(mean.shape, dtype=mean.dtype))
    std = ifelse(T.gt(input.shape[0], 1), std, T.ones(std.shape, dtype=std.dtype))
    # scale the centred input by gamma/std, then shift by beta
    return (input - mean) * T.addbroadcast(self.gamma / std, *self.axes) + self.beta
def objective(y_true, y_pred, P, Q, alpha=0., beta=0.15, dbeta=0., gamma=0.01,
              gamma1=-1., poos=0.23, eps=1e-6):
    '''Expects a binary class matrix instead of a vector of scalar classes.
    '''
    beta = np.float32(beta)
    dbeta = np.float32(dbeta)
    gamma = np.float32(gamma)
    poos = np.float32(poos)
    eps = np.float32(eps)

    # scale preds so that the class probas of each sample sum to 1
    y_pred += eps
    y_pred /= y_pred.sum(axis=-1, keepdims=True)

    y_true = T.cast(y_true.flatten(), 'int64')
    y1 = T.and_(T.gt(y_true, 0), T.le(y_true, Q))  # in-set
    y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
    y0sum = y0.sum() + eps  # number of oos
    y1sum = y1.sum() + eps  # number of in-set

    # we want to reduce the cross-entropy of labeled data;
    # convert all oos/unlabeled to label=0
    cost0 = T.nnet.categorical_crossentropy(y_pred, T.switch(y_true <= Q, y_true, 0))
    cost0 = T.dot(y1, cost0) / y1sum  # average cost per labeled example

    if alpha:
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per unlabeled example
        cost0 += alpha * cost1

    # we want to increase the average entropy in each batch
    # average over batch
    if beta:
        y_pred_avg0 = T.dot(y0, y_pred) / y0sum
        y_pred_avg0 = T.clip(y_pred_avg0, eps, np.float32(1) - eps)
        y_pred_avg0 /= y_pred_avg0.sum(axis=-1, keepdims=True)
        cost2 = T.nnet.categorical_crossentropy(y_pred_avg0.reshape((1, -1)), P - dbeta)[0]  # [None,:]
        cost2 = T.switch(y0sum > 0.5, cost2, 0.)  # ignore cost2 if no samples
        cost0 += beta * cost2

    # binary classifier score
    if gamma:
        y_pred0 = T.clip(y_pred[:, 0], eps, np.float32(1) - eps)
        if gamma1 < 0.:
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot(np.float32(1) - poos * y0.T, T.log(np.float32(1) - y_pred0))
            cost3 /= y_pred.shape[0]
            cost0 += gamma * cost3
        elif gamma1 > 0.:
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos) * y0, T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost31 = - T.dot(y1, T.log(np.float32(1) - y_pred0))
            cost31 /= y1sum
            cost0 += gamma * cost3 + gamma1 * cost31
        else:  # gamma1 == 0.
            cost3 = - T.dot(poos * y0, T.log(y_pred0)) \
                    - T.dot((np.float32(1) - poos) * y0, T.log(np.float32(1) - y_pred0))
            cost3 /= y0sum
            cost0 += gamma * cost3
    return cost0
def new_range(overflow, overflow_1, max_overflow):
    # the goal is to update the range of the vector
    # we know the overflow rates associated with range (overflow)
    # and range-1 (overflow_1)
    # if (overflow > max_overflow): increment range
    # else if (overflow_1 < max_overflow): decrement range
    return T.switch(T.gt(overflow, max_overflow), 1,
                    T.switch(T.gt(overflow_1, max_overflow), 0, -1))
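# Tiny check of the three-way decision above (sketch, plain Theano only).
import theano
import theano.tensor as T

o, o1, mx = T.dscalars('o', 'o1', 'mx')
f = theano.function([o, o1, mx], new_range(o, o1, mx))
print(f(0.3, 0.2, 0.25))  # 1: overflow too high, widen the range
print(f(0.2, 0.3, 0.25))  # 0: current range is the narrowest acceptable
print(f(0.1, 0.2, 0.25))  # -1: even range-1 stays under the limit, narrow it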
def rprop(param, learning_rate, gparam, mask, updates, current_cost, previous_cost,
          eta_plus=1.2, eta_minus=0.5, max_delta=50, min_delta=10e-6):
    previous_grad = sharedX(numpy.ones(param.shape.eval()), borrow=True)
    delta = sharedX(learning_rate * numpy.ones(param.shape.eval()), borrow=True)
    previous_inc = sharedX(numpy.zeros(param.shape.eval()), borrow=True)
    zero = T.zeros_like(param)
    one = T.ones_like(param)
    change = previous_grad * gparam

    new_delta = T.clip(
        T.switch(
            T.eq(gparam, 0.),
            delta,
            T.switch(
                T.gt(change, 0.),
                delta * eta_plus,
                T.switch(
                    T.lt(change, 0.),
                    delta * eta_minus,
                    delta))),
        min_delta,
        max_delta)
    new_previous_grad = T.switch(
        T.eq(mask * gparam, 0.),
        previous_grad,
        T.switch(
            T.gt(change, 0.),
            gparam,
            T.switch(
                T.lt(change, 0.),
                zero,
                gparam)))
    inc = T.switch(
        T.eq(mask * gparam, 0.),
        zero,
        T.switch(
            T.gt(change, 0.),
            - T.sgn(gparam) * new_delta,
            T.switch(
                T.lt(change, 0.),
                zero,
                - T.sgn(gparam) * new_delta)))

    updates.append((previous_grad, new_previous_grad))
    updates.append((delta, new_delta))
    updates.append((previous_inc, inc))
    return param + inc * mask
def tukey_biweight(predictions, targets, c=4.685, s=1.4826):
    """
    Tukey's biweight function expressed in Theano.

    :param predictions: Prediction tensor
    :param targets: Target tensor
    :param c: Tukey tuning constant
    :param s: Consistence scale parameter
    :return: Cost function
    """
    # Flatten input to make calc easier
    pred = predictions.flatten(2)
    target = targets.flatten(2)
    # Compute mask
    mask = T.gt(target, 0)
    # Compute n of valid pixels
    n_valid = T.sum(mask, axis=1)
    # Apply mask and log transform
    m_pred = pred * mask
    m_t = T.switch(mask, target, 0)

    def median(tensor):
        """
        MAD tensor from https://groups.google.com/forum/#!topic/theano-users/I4eHjbAetEQ

        :param tensor: Input tensor
        :return: Median expression
        """
        tensor = tensor.flatten(1)
        return T.switch(T.eq((tensor.shape[0] % 2), 0),
                        # if even vector
                        T.mean(T.sort(tensor)[((tensor.shape[0] / 2) - 1):
                                              ((tensor.shape[0] / 2) + 1)]),
                        # if odd vector
                        T.sort(tensor)[tensor.shape[0] // 2])

    def mad(tensor):
        """
        Median absolute deviation

        :param tensor: Input tensor
        :return: MAD
        """
        med = median(tensor=tensor)
        return median(T.abs_(tensor - med))

    # Residual
    r_i = (m_pred - m_t)
    # r_i = r_i / (s * mad(r_i))
    r_i = r_i / r_i.std()

    # Compute the masking vectors
    tukey_mask = T.gt(T.abs_(r_i), c)

    # Cost
    cost = (c ** 2 / 6) * (1 - (1 - (r_i / c) ** 2) ** 3)

    # Aggregate
    return T.sum(T.sum(T.switch(tukey_mask, (c ** 2) / 6., cost), axis=1)) \
        / T.maximum((T.sum(n_valid)), 1)
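# Hedged usage sketch: compile the cost above on 4-D prediction/target
# tensors (shapes and data are toy values, not from the original code).
import numpy as np
import theano
import theano.tensor as T

preds = T.tensor4('preds')
targets = T.tensor4('targets')
f = theano.function([preds, targets], tukey_biweight(preds, targets))
p = np.random.rand(2, 1, 4, 4).astype(theano.config.floatX)
t = np.random.rand(2, 1, 4, 4).astype(theano.config.floatX)
print(f(p, t))  # finite scalar cost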
def rebuild(self):
    for i, (inputs, f) in enumerate(self.wiring):
        if not inputs:
            continue
        lin_comb = T.dot(T.concatenate([self._vlayers[j] for j in inputs], axis=1),
                         self._vweights[i])
        add_biases = lin_comb + self._vbiases[i]
        self._vlayers[i] = f(add_biases)
    self._output = T.concatenate([self._vlayers[j] for j in self.output_layers], axis=1)
    self._targets = [T.matrix() for j in self.output_layers]
    crossentropy = sum([(T.nnet.categorical_crossentropy(self._vlayers[j], self._targets[i])
                         if self.wiring[j][1] == SOFTMAX_FUN
                         else ((self._vlayers[j] - self._targets[i]) ** 2
                               / (1 + self._targets[i].max()) ** 2).sum())
                        for i, j in enumerate(self.output_layers)])
    self._cost = (crossentropy.sum() +
                  # L2 regularization on the weights
                  self.L2REG / (self.layers[i]) *
                  sum((weight ** 2).sum() for weight in self._vweights if weight is not None) +
                  # L2 regularization on the biases
                  0.01 * self.L2REG / math.sqrt(self.layers[i]) *
                  sum((bias ** 2).sum() for j, bias in enumerate(self._vbiases)
                      if bias is not None and self.wiring[j][1] != LINEAR_FUN))
    self._costnoreg = crossentropy.sum()
    self._derivatives = [None] * len(self.layers)
    self._updates = []
    MAX_DERIV = 1000
    for i, (inputs, f) in enumerate(self.wiring):
        if not inputs:
            continue
        deriv1 = T.grad(self._cost, self._vweights[i])
        deriv1p = T.switch(T.lt(deriv1, MAX_DERIV), deriv1, MAX_DERIV)
        deriv1pp = T.switch(T.gt(deriv1p, -MAX_DERIV), deriv1p, -MAX_DERIV)
        #deriv1ppp = T.switch(T.isnan(deriv1pp), 0, deriv1pp)
        deriv2 = T.grad(self._cost, self._vbiases[i])
        deriv2p = T.switch(T.lt(deriv2, MAX_DERIV), deriv2, MAX_DERIV)
        deriv2pp = T.switch(T.gt(deriv2p, -MAX_DERIV), deriv2p, -MAX_DERIV)
        #deriv2ppp = T.switch(T.isnan(deriv2pp), 0, deriv2pp)
        self._derivatives[i] = (deriv1pp, deriv2pp)
        self._updates.append((self._vweights[i],
                              self._vweights[i] - self.learning_rate * self._derivatives[i][0]))
        self._updates.append((self._vbiases[i],
                              self._vbiases[i] - self.learning_rate * self._derivatives[i][1]))
    self._prediction = theano.function(inputs=[self._vlayers[i] for i in self.input_layers],
                                       outputs=self._output)
    self._train = theano.function(inputs=self._targets + [self._vlayers[i] for i in self.input_layers],
                                  outputs=self._cost,
                                  updates=self._updates,
                                  allow_input_downcast=True)
    #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))  # debug NaN
    self._costfun = theano.function(inputs=self._targets + [self._vlayers[i] for i in self.input_layers],
                                    outputs=self._costnoreg,
                                    allow_input_downcast=True)
def _forward_negative_z(inputs, weights, bias=None):
    inputs_plus = inputs * T.gt(inputs, 0)
    weights_plus = weights * T.gt(weights, 0)
    inputs_minus = inputs * T.lt(inputs, 0)
    weights_minus = weights * T.lt(weights, 0)
    negative_part_a = conv2d(inputs_plus, weights_minus)
    negative_part_b = conv2d(inputs_minus, weights_plus)
    together = negative_part_a + negative_part_b
    if bias is not None:
        bias_negative = bias * T.lt(bias, 0)
        together += bias_negative.dimshuffle("x", 0, "x", "x")
    return together
def relevance_conv_a_b_sign_switch(inputs, weights, out_relevances, a, b, bias=None):
    assert a is not None
    assert b is not None
    assert a - b == 1
    # For each input, determine what
    outputs = conv2d(inputs, weights)
    if bias is not None:
        outputs += bias.dimshuffle("x", 0, "x", "x")
        # do not use bias further, only to determine direction of outputs
        bias = None
    # stabilize
    # prevent division by 0 and division by small numbers
    eps = 1e-4
    outputs += T.sgn(outputs) * eps
    outputs += T.eq(outputs, 0) * eps
    positive_forward = _forward_positive_z(inputs, weights, bias)
    negative_forward = _forward_negative_z(inputs, weights, bias)
    rel_for_positive_outputs = out_relevances * T.gt(outputs, 0)
    rel_for_negative_outputs = out_relevances * T.lt(outputs, 0)
    positive_norm_with_trend = positive_forward * T.gt(outputs, 0)
    negative_norm_with_trend = negative_forward * T.lt(outputs, 0)
    # minus to make overall norm positive
    norm_with_trend = positive_norm_with_trend - negative_norm_with_trend
    # stabilize also
    norm_with_trend += T.eq(norm_with_trend, 0) * eps
    in_positive_with_trend = _backward_positive_z(
        inputs, weights, rel_for_positive_outputs / norm_with_trend, bias)
    in_negative_with_trend = _backward_negative_z(
        inputs, weights, rel_for_negative_outputs / norm_with_trend, bias)
    # Minus in_negative since in_with_trend should not switch signs
    in_with_trend = in_positive_with_trend - in_negative_with_trend
    positive_norm_against_trend = positive_forward * T.lt(outputs, 0)
    negative_norm_against_trend = negative_forward * T.gt(outputs, 0)
    # minus to make overall norm positive
    norm_against_trend = positive_norm_against_trend - negative_norm_against_trend
    # stabilize also
    norm_against_trend += T.eq(norm_against_trend, 0) * eps
    in_positive_against_trend = _backward_positive_z(
        inputs, weights, rel_for_negative_outputs / norm_against_trend, bias)
    in_negative_against_trend = _backward_negative_z(
        inputs, weights, rel_for_positive_outputs / norm_against_trend, bias)
    # Minus in_negative since switching signs is done below
    in_against_trend = in_positive_against_trend - in_negative_against_trend
    in_relevances = a * in_with_trend - b * in_against_trend
    return in_relevances
def getOverlap(a, b):
    '''
    Given (batch_size, grid_num, box_num, 4) tensors,
    return (batch_size, grid_num, box_num) overlap area
    '''
    a, b = get_bound(a), get_bound(b)
    xmin = get_max(a, b, 0)
    xmax = get_min(a, b, 2)
    ymin = get_max(a, b, 1)
    ymax = get_min(a, b, 3)
    xside, yside = xmax - xmin, ymax - ymin
    xside = T.switch(T.gt(xside, 0), xside, 0)
    yside = T.switch(T.gt(yside, 0), yside, 0)
    return xside * yside
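# The same clamp-to-zero overlap idiom on two concrete boxes. get_bound/
# get_max/get_min live in the surrounding module, so this standalone sketch
# inlines the arithmetic instead.
import theano.tensor as T

ax0, ay0, ax1, ay1 = 0., 0., 2., 2.   # box a
bx0, by0, bx1, by1 = 1., 1., 3., 3.   # box b
xside = T.minimum(ax1, bx1) - T.maximum(ax0, bx0)
yside = T.minimum(ay1, by1) - T.maximum(ay0, by0)
xside = T.switch(T.gt(xside, 0), xside, 0)
yside = T.switch(T.gt(yside, 0), yside, 0)
print((xside * yside).eval())  # 1.0: the boxes overlap in a unit square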
def __init__(self, alpha, m, *args, **kwargs):
    super(Pareto, self).__init__(*args, **kwargs)
    self.alpha = alpha
    self.m = m
    self.mean = tt.switch(tt.gt(alpha, 1), alpha * m / (alpha - 1.), np.inf)
    self.median = m * 2.**(1. / alpha)
    self.variance = tt.switch(
        tt.gt(alpha, 2),
        (alpha * m**2) / ((alpha - 2.) * (alpha - 1.)**2),
        np.inf)

    assert_negative_support(alpha, 'alpha', 'Pareto')
    assert_negative_support(m, 'm', 'Pareto')
def fprop(self, state_below):
    print "======fprop====="
    rng = RandomStreams(seed=234)
    #size = theano.tensor.as_tensor_variable((state_below.shape[0], self.dim))
    self.noise = rng.normal(size=(state_below.shape[0], self.dim), avg=0, std=self.std)
    #self.noise = T.log(un/(1-un))
    p = self._linear_part(state_below) + self.noise
    batch_size = (p.shape[0]).astype(config.floatX)
    self.active_rate = T.gt(p, self.threshold).sum(axis=0, dtype=config.floatX) / batch_size
    return T.gt(p, self.threshold) * p
def logp(self, X):
    n = self.n
    p = self.p
    V = self.V

    IVI = det(V)
    IXI = det(X)

    return bound(((n - p - 1) * log(IXI)
                  - trace(matrix_inverse(V).dot(X))
                  - n * p * log(2)
                  - n * log(IVI)
                  - 2 * multigammaln(n / 2., p)) / 2,
                 gt(n, (p - 1)),
                 all(gt(eigh(X)[0], 0)),
                 eq(X, X.T))
def test_v2(self):
    q_matrix = self.q
    d_matrix = self.d[0::self.negative_d_num + 1]
    cosine_vector = self.compute_cosine_between_matrixes(q_matrix, d_matrix)
    for i in range(1, self.negative_d_num + 1):
        q_matrix = self.q
        d_matrix = self.d[i::self.negative_d_num + 1]
        cosine_vector = T.concatenate([cosine_vector,
                                       self.compute_cosine_between_matrixes(q_matrix, d_matrix)])
    components_reshape = T.reshape(cosine_vector,
                                   (self.negative_d_num + 1, self.mini_batch_size)).T

    gt_1 = T.sum(T.gt(components_reshape[:, 0], components_reshape[:, 1]))
    gt_2 = T.sum(T.gt(components_reshape[:, 0], components_reshape[:, 2]))
    gt_3 = T.sum(T.gt(components_reshape[:, 0], components_reshape[:, 3]))
    gt_4 = T.sum(T.gt(components_reshape[:, 0], components_reshape[:, 4]))
    gt_sum = gt_1 + gt_2 + gt_3 + gt_4

    return components_reshape, gt_sum * 1.0 / (self.mini_batch_size * self.negative_d_num)
def drop(input_value, dropout):
    # `dropout` is a plain Python number at graph-build time: a symbolic
    # T.gt() result cannot be used in a Python `if`, so compare it directly
    if dropout > 0.:
        retain_prob = 1 - dropout
        mask = srng.binomial(n=1, p=retain_prob, size=input_value.shape,
                             dtype='floatX')
        return input_value * mask / retain_prob
    return input_value
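# Sketch of dropout usage (assumes the module-level RandomStreams named
# `srng` that drop() relies on; the shape is a toy value).
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=42)
x = T.matrix('x')
f = theano.function([x], drop(x, 0.5))
print(f(np.ones((4, 4), dtype=theano.config.floatX)))  # ~half zeros, rest scaled to 2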
def timestep(predictions, label, len_example, total_len_example):
    label_binary = T.gt(label[0:len_example - 1], 0)
    oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
    a = total_len_example
    return T.sum(T.log(1. / predictions[T.arange(len_example - 1),
                                        label[0:len_example - 1]]) * label_binary), oov_count
def step_fn(curr, r, h_prev):
    # r is symbolic inside scan, so branch with T.switch rather than a
    # Python `if`, which would be resolved once at graph-build time
    leaky_hid = self.alpha * h_prev + \
        (1 - self.alpha) * self.f(curr + T.dot(self.W2, h_prev))
    fresh_hid = self.f(curr)
    curr_hid = T.switch(T.gt(r, 0), leaky_hid, fresh_hid)
    curr_hid = T.flatten(curr_hid)
    return curr_hid
def logp(self, value):
    mu = self.mu
    sigma = self.sigma
    nu = self.nu

    # This condition suggested by exGAUS.R from gamlss
    lp = T.switch(T.gt(nu, 0.05 * sigma),
                  - T.log(nu) + (mu - value) / nu
                  + 0.5 * (sigma / nu)**2
                  + logpow(std_cdf((value - mu) / sigma - sigma / nu), 1.),
                  - T.log(sigma * T.sqrt(2 * np.pi))
                  - 0.5 * ((value - mu) / sigma)**2)
    return bound(lp, sigma > 0., nu > 0.)
def loss_confident_bootstrapping(self, y, factor=1):
    # Customized categorical cross entropy.
    # Based on the multibox impl. More tuned to the paper. More strict.
    p = self.output
    # Only confident predictions are included. Everything between 0.2 and 0.8
    # is disregarded: 60% of the range.
    hardUpper = T.gt(p, 0.8)
    hardLower = T.le(p, 0.2)
    loss = (-T.sum(((factor * y) + ((1.0 - factor) * hardUpper)) * T.log(p))
            - T.sum(((factor * (1.0 - y)) + ((1.0 - factor) * hardLower)) * T.log(1.0 - p)))
    return loss / self.size
def __init__(self, alpha, beta=1, *args, **kwargs):
    super(InverseGamma, self).__init__(*args, **kwargs)
    self.alpha = alpha = tt.as_tensor_variable(alpha)
    self.beta = beta = tt.as_tensor_variable(beta)

    self.mean = self._calculate_mean()
    self.mode = beta / (alpha + 1.)
    # variance is beta^2 / ((alpha - 1)^2 (alpha - 2)), finite only for alpha > 2
    self.variance = tt.switch(tt.gt(alpha, 2),
                              (beta**2) / ((alpha - 2.) * (alpha - 1.)**2),
                              np.inf)
    assert_negative_support(alpha, 'alpha', 'InverseGamma')
    assert_negative_support(beta, 'beta', 'InverseGamma')
def logp(self, value):
    mu = self.mu
    alpha = self.alpha
    negbinom = bound(binomln(value + alpha - 1, value)
                     + logpow(mu / (mu + alpha), value)
                     + logpow(alpha / (mu + alpha), alpha),
                     value >= 0, mu > 0, alpha > 0)

    # Return Poisson when alpha gets very large.
    return tt.switch(tt.gt(alpha, 1e10),
                     Poisson.dist(self.mu).logp(value),
                     negbinom)
def loss_crosstrapping(self, y, factor=1):
    # Almost the same as bootstrapping, except that the mean is used for the
    # overall result, which more closely follows the cross-entropy
    # implementation. When factor is 1, cross-entropy equals this
    # implementation, so performance without a decreasing factor should be
    # the same!
    p = self.output
    hard = T.gt(p, 0.5)
    cross = -(((factor * y * T.log(p)) + ((1.0 - factor) * hard * T.log(p)))
              + ((factor * (1.0 - y) * T.log(1.0 - p))
                 + ((1.0 - factor) * (1.0 - hard) * T.log(1.0 - p))))
    return T.mean(cross)
def step(i, x, *args):
    x_i = x[T.arange(x.shape[0]), i]
    x_reversed = T.set_subtensor(x_i, 1.0 - x_i)
    merged = T.concatenate([x, x_reversed], axis=0)
    eng = energy(merged).flatten()
    eng_x = eng[:x.shape[0]]
    eng_r = eng[x.shape[0]:]
    cond = T.gt(eng_x, eng_r)
    # The update values
    updated = T.switch(cond, x_i, 1.0 - x_i)
    return T.set_subtensor(x_i, updated)
def in_transit(self, t, r=0.0, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r) + z
    R = self.r_star + z

    # Wrap the times into time since transit
    hp = 0.5 * self.period
    dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / self.r_star
        arg = tt.square(1 + k) - tt.square(self.b)
        hdur = hp * tt.arcsin(self.r_star / self.a *
                              tt.sqrt(arg) / self.sin_incl) / np.pi
        t_start = -hdur
        t_end = hdur
        flag = z
    else:
        M_contact = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + z, self.sin_incl + z, R + r)
        flag = M_contact[2]

        t_start = (M_contact[0] - self.M0) / self.n
        t_start = tt.mod(t_start + hp, self.period) - hp
        t_end = (M_contact[1] - self.M0) / self.n
        t_end = tt.mod(t_end + hp, self.period) - hp

    if texp is not None:
        t_start -= 0.5 * texp
        t_end += 0.5 * texp

    mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)
    result = ifelse(tt.and_(tt.all(tt.eq(flag, 0)),
                            tt.all(tt.gt(t_end, t_start))),
                    tt.arange(t.size)[mask],
                    tt.arange(t.size))

    return result
def logp(self, value):
    psi = self.psi
    theta = self.theta

    logp_val = tt.switch(tt.gt(value, 0),
                         tt.log(psi) + self.pois.logp(value),
                         logaddexp(tt.log1p(-psi), tt.log(psi) - theta))

    return bound(logp_val,
                 0 <= value,
                 0 <= psi, psi <= 1,
                 0 <= theta)
def __init__(self, f, θs, α=0.001, β1=0.9, β2=0.999, β3=0.999, k=0.1, K=10.,
             ε=1e-8, dec=0.):
    α, β1, β2, β3, ε, dec = [np.cast[floatX](h) for h in [α, β1, β2, β3, ε, dec]]
    # `one` is presumably a module-level constant in the original; defined
    # here so the snippet is self-contained
    one = np.cast[floatX](1.0)

    t = theano.shared(0, name="t")
    t_u = (t, t + 1)

    f_prev = theano.shared(np.cast[floatX](0), name="f_prev")

    ch_fact_lbound = T.switch(T.gt(f, f_prev), 1 + k, 1 / (1 + K))
    ch_fact_ubound = T.switch(T.gt(f, f_prev), 1 + K, 1 / (1 + k))
    f_ch_fact = f / f_prev
    f_ch_fact = T.switch(T.lt(f_ch_fact, ch_fact_lbound), ch_fact_lbound, f_ch_fact)
    f_ch_fact = T.switch(T.gt(f_ch_fact, ch_fact_ubound), ch_fact_ubound, f_ch_fact)
    f_hat = T.switch(T.gt(t_u[1], 1), f_prev * f_ch_fact, f)
    f_u = (f_prev, f_hat)

    self.ms = [theano.shared(np.zeros(θ.shape.eval(), dtype=floatX),
                             borrow=True, name="m") for θ in θs]
    self.vs = [theano.shared(np.zeros(θ.shape.eval(), dtype=floatX),
                             borrow=True, name="v") for θ in θs]

    d = theano.shared(one, name="d")
    d_den = T.switch(T.gt(f_hat, f_prev), f_prev, f_hat)
    d_t = (β3 * d) + (one - β3) * T.abs_((f_hat - f_prev) / d_den)
    d_t = T.switch(T.gt(t_u[1], one), d_t, one)
    d_u = (d, d_t)

    gs = T.grad(f, θs)

    m_us = [(m, β1 * m + (one - β1) * g) for m, g in zip(self.ms, gs)]
    m_hats = [m_u[1] / (one - T.pow(β1, t_u[1])) for m_u in m_us]

    v_us = [(v, β2 * v + (one - β2) * T.sqr(g)) for v, g in zip(self.vs, gs)]
    v_hats = [v_u[1] / (one - T.pow(β2, t_u[1])) for v_u in v_us]

    θ_us = [(θ, θ - (α / (one + (t_u[1] * dec))) * m_hat / ((T.sqrt(v_hat) * d_t) + ε))
            for θ, m_hat, v_hat in zip(θs, m_hats, v_hats)]

    self.updates = m_us + v_us + [t_u, f_u, d_u] + θ_us
def _step(i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r):
    xk = -(x * k1 * k2) / (k3 * k4)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk

    xk = (x * k5 * k6) / (k7 * k8)
    pk = pkm1 + pkm2 * xk
    qk = qkm1 + qkm2 * xk
    pkm2 = pkm1
    pkm1 = pk
    qkm2 = qkm1
    qkm1 = qk

    old_r = r
    r = tt.switch(tt.eq(qk, zero), r, pk / qk)

    k1 += one
    k2 += k26update
    k3 += two
    k4 += two
    k5 += one
    k6 -= k26update
    k7 += two
    k8 += two

    big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
    biginv_cond = tt.or_(tt.lt(tt.abs_(qk), BIGINV),
                         tt.lt(tt.abs_(pk), BIGINV))

    pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
    pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
    qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
    qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)

    pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
    pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
    qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
    qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)

    return ((pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r),
            until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))))
def from_partial_old(self, X, dX):
    eps = 1e-10  # np.spacing(1)
    U, S, V = X
    dU, dS, dV = dX
    S = tensor.diag(S)
    S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
    S_pinv = tensor.diag(S_pinv)
    ZV = dU.dot(S_pinv)
    UtZV = dS
    ZtU = S_pinv.dot(dV)
    Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
    return Zproj
def triplet_loss(predictions, triplets):
    # Value of alpha (the margin)
    a = np.float32(0.2)
    # Compute the distances between the anchor/positive and anchor/negative
    # representations
    dist1 = ((predictions[triplets[:, 0]] - predictions[triplets[:, 1]])**2).sum(axis=1)
    dist2 = ((predictions[triplets[:, 0]] - predictions[triplets[:, 2]])**2).sum(axis=1)
    s = dist1 - dist2 + a
    # Compute the loss (hinge: only positive values of s contribute)
    loss = s * T.gt(s, 0.0)
    return loss
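# Quick check of the hinge behaviour (sketch; the embeddings and triplet
# indices are toy values).
import numpy as np
import theano
import theano.tensor as T

preds = T.fmatrix('preds')
trips = T.imatrix('trips')
f = theano.function([preds, trips], triplet_loss(preds, trips))
embeddings = np.array([[0., 0.], [0.1, 0.], [5., 5.]], dtype='float32')
triplets = np.array([[0, 1, 2]], dtype='int32')  # anchor, positive, negative
print(f(embeddings, triplets))  # [0.]: the negative is far away, hinge inactive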
def build_target_label_prediction(self, valid_out, loss_type, K):
    """ Picks the target vector for each class, as well as a strategy for
    picking the predicted label from the network output """
    if re.search('one-hot', loss_type):
        # should be used with softmax out
        identity = numpy.identity(K)
        pred_valid = T.argmax(valid_out, axis=1)
        klass_targets = identity
    elif re.search('nnrank', loss_type):
        # should be used with sigmoid out
        if self.num_output_classes == K:
            nnrank_target = numpy.tril(numpy.ones((K, K)))
            pred_valid = T.sum(T.gt(valid_out, 0.5), axis=1) - 1  # can potentially return -1
        elif self.num_output_classes == (K - 1):
            nnrank_target = numpy.array([[0] * (K - 1)] +
                                        numpy.tril(numpy.ones((K - 1, K - 1))).tolist())
            # TODO check for discontinuities rather than assuming none with the sum
            # TODO do better than a shared threshold
            pred_valid = T.sum(T.gt(valid_out, 0.5), axis=1)
        klass_targets = nnrank_target
    return (theano.shared(lasagne.utils.floatX(klass_targets)), pred_valid)
def define_test_functions(disc_nonlinearity, prediction, prediction_det, target_var_sup):
    if disc_nonlinearity in ["sigmoid", "softmax", "softmax_hierarchy"]:
        if disc_nonlinearity == "sigmoid":
            test_pred = T.gt(prediction_det, 0.5)
            test_acc = T.mean(T.eq(test_pred, target_var_sup),
                              dtype=theano.config.floatX) * 100.
        elif disc_nonlinearity in ["softmax", "softmax_hierarchy"]:
            test_pred = prediction_det.argmax(1)
            test_acc = T.mean(T.eq(test_pred, target_var_sup.argmax(1)),
                              dtype=theano.config.floatX) * 100
        return test_acc, test_pred
def logp(self, value):
    psi = self.psi
    p = self.p
    n = self.n

    logp_val = tt.switch(
        tt.gt(value, 0),
        tt.log(psi) + self.bin.logp(value),
        logaddexp(tt.log1p(-psi), tt.log(psi) + n * tt.log1p(-p)))

    return bound(logp_val,
                 0 <= value, value <= n,
                 0 <= psi, psi <= 1,
                 0 <= p, p <= 1)
def get_output_for(self, input, deterministic=False, **kwargs):
    # print(super(snn_denseLayer, self).get_output_for(input, **kwargs))
    self.input = input
    v = self.v_in + super(snn_denseLayer, self).get_output_for(input, **kwargs)
    # v = super(snn_denseLayer, self).get_output_for(input, **kwargs)
    vmax = T.max(v)
    flag = T.gt(vmax, self.threshold)
    self.output_spike = T.switch(T.eq(vmax, v), flag, 0.0)
    self.v_out = flag * self.refractory_voltage + (1.0 - flag) * v
    # sample_net.do_stdp()
    return self.output_spike
def logp(self, value):
    alpha = self.alpha
    mu = self.mu
    psi = self.psi

    logp_val = tt.switch(
        tt.gt(value, 0),
        tt.log(psi) + self.nb.logp(value),
        logaddexp(tt.log1p(-psi),
                  tt.log(psi) + alpha * (tt.log(alpha) - tt.log(alpha + mu))))

    return bound(logp_val,
                 0 <= value,
                 0 <= psi, psi <= 1,
                 mu > 0, alpha > 0)
def triplet_loss(predictions, triplets):
    #loss = 0.0
    a = np.float32(0.2)
    dist1 = ((predictions[triplets[:, 0]] - predictions[triplets[:, 1]])**2).sum(axis=1)
    dist2 = ((predictions[triplets[:, 0]] - predictions[triplets[:, 2]])**2).sum(axis=1)
    s = dist1 - dist2 + a
    loss = s * T.gt(s, 0.0)
    return loss
def _get_updates_for(self, param, grad):
    grad_tm1 = util.shared_like(param, 'grad')
    step_tm1 = util.shared_like(param, 'step', self.learning_rate.eval())
    test = grad * grad_tm1
    diff = TT.lt(test, 0)
    steps = step_tm1 * (TT.eq(test, 0) +
                        TT.gt(test, 0) * self.step_increase +
                        diff * self.step_decrease)
    step = TT.minimum(self.max_step, TT.maximum(self.min_step, steps))
    grad = grad - diff * grad
    yield param, TT.sgn(grad) * step
    yield grad_tm1, grad
    yield step_tm1, step
def __init__(self, input, truth, mask):
    self.output_shape = input.output_shape
    Layer.linkstruct[input].append(self)
    # 2 parts: 0 < o < t, and t < o
    diff = truth.resp * 0.8 - input.output
    diff = T.switch(T.gt(diff, 0), diff, 0)
    diff = T.switch(T.eq(truth.resp, 0), 1, diff)
    diffsmul = T.prod(diff, axis=1)
    loss = T.sum(diffsmul * mask)
    self.loss = loss
    self.output = truth.resp
    self.output_shape = truth.resp_shape
def OneStep(alpha1, b1, alpha2, b2):
    alpha1_new = (T.abs_(b1 * D * W).sum() / T.abs_(b1 * D).sum()).astype('float32')
    b1_new = T.switch(T.gt(W / alpha1_new, 0.5), 1., 0.)
    alpha2_new = (T.abs_(b2 * D * W).sum() / T.abs_(b2 * D).sum()).astype('float32')
    b2_new = T.switch(T.lt(W / alpha2_new, -0.5), -1., 0.)
    delta1 = T.abs_(alpha1_new - alpha1)
    delta2 = T.abs_(alpha2_new - alpha2)
    # both deltas must be small; Python's `and` does not work on symbolic tensors
    condition = T.and_(T.lt(delta1, 1e-6), T.lt(delta2, 1e-6))
    return [alpha1_new, b1_new, alpha2_new, b2_new], theano.scan_module.until(condition)
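# Runnable sketch of driving OneStep with scan until the stopping condition
# fires. D and W stand in for the scale and weight tensors the function
# closes over; the values, and a floatX=float32 setting (to match the
# astype('float32') calls above), are assumptions.
import numpy as np
import theano
import theano.tensor as T

D = theano.shared(np.ones((3, 3), dtype='float32'))
W = theano.shared(np.array([[0.7, -0.6, 0.2],
                            [-0.1, 0.9, -0.8],
                            [0.4, -0.3, 0.5]], dtype='float32'))

outputs, updates = theano.scan(
    OneStep,
    outputs_info=[np.float32(1.0), T.switch(T.gt(W, 0), 1., 0.),
                  np.float32(1.0), T.switch(T.lt(W, 0), -1., 0.)],
    n_steps=50)  # until(condition) ends the loop once both deltas are tiny
ternarize = theano.function([], [o[-1] for o in outputs], updates=updates)
alpha1, b1, alpha2, b2 = ternarize()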
def get_monitoring_channels_from_state(self, state, target=None):
    channels = super(MultiSigmoid, self).get_monitoring_channels_from_state(state, target)

    # iterate over a copy of the keys, since we delete while iterating
    for c in list(channels):
        if 'misclass' in c:
            del channels[c]

    z, = state.owner.inputs
    geo = T.nnet.sigmoid(z.mean(axis=1).dimshuffle(0, 'x'))
    geo_class = T.gt(geo, 0.5)
    misclass = T.cast(T.neq(geo_class, target), config.floatX).mean()
    channels['misclass'] = misclass

    return channels
def __init__(self, input, n_in, n_out, is_binary=False, threshold=0.4, rng=None):
    """
    Initialize the parameters of the logistic regression.

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
    architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
    which the datapoints lie

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
    which the labels lie (number of classes)
    """
    self.activation = T.nnet.sigmoid
    self.threshold = threshold
    super(LogisticRegressionLayer, self).__init__(input, n_in, n_out,
                                                  self.activation, rng)
    self.reset_layer()

    self.is_binary = is_binary
    if n_out == 1:
        self.is_binary = True
    # The number of classes seen
    self.n_classes_seen = np.zeros(n_out)
    # The number of wrong classifications made for class i
    self.n_wrong_classif_made = np.zeros(n_out)

    self.reset_conf_mat()

    # Compute vector of class-membership probabilities in symbolic form
    # self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
    self.p_y_given_x = self.get_class_memberships(self.input)

    if not self.is_binary:
        # Compute prediction as the class whose probability is maximal
        # in symbolic form
        self.y_decision = T.argmax(self.p_y_given_x, axis=1)
    else:
        # If the probability is greater than the specified threshold,
        # assign to class 1, otherwise 0. This can also be checked
        # as p(y=1|x) > threshold.
        self.y_decision = T.gt(T.flatten(self.p_y_given_x), self.threshold)

    self.params = [self.W, self.b]
def momentum(cost, params, current_epoch, lr, init_momentum):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        vel = theano.shared(p.get_value() * 0.)
        current_momentum = theano.shared(floatX(init_momentum))
        vel_new = current_momentum * vel - lr * g
        # ramp the momentum linearly from 0.09 to 0.99 over the first
        # 200 epochs, then hold it at 0.99
        momentum_new = (T.le(current_epoch, 200.)
                        * (current_epoch * (0.99 - 0.09) / 200. + 0.09)
                        + T.gt(current_epoch, 200.) * 0.99)
        updates.append((vel, vel_new))
        updates.append((p, p + vel_new))
        updates.append((current_momentum, momentum_new))
    return updates
def find_right_bound(prev_func_output, step, maxstep):
    func_output = f(step)

    is_output_decrease = T.gt(prev_func_output, func_output)
    step = ifelse(
        is_output_decrease,
        T.minimum(2. * step, maxstep),
        step
    )

    is_output_increase = T.lt(prev_func_output, func_output)
    stoprule = theano.scan_module.until(
        T.or_(is_output_increase, step > maxstep)
    )

    return [func_output, step], stoprule
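# Runnable sketch of the bracketing loop driven by scan. In the original,
# `f` comes from the enclosing line-search scope; a toy module-level
# quadratic stands in for it here.
import numpy as np
import theano
import theano.tensor as T

def f(step):
    return (step - 5.) ** 2  # toy objective, minimum at step = 5

outputs, _ = theano.scan(
    find_right_bound,
    outputs_info=[np.float64(1e10), np.float64(0.5)],
    non_sequences=[np.float64(100.)],
    n_steps=50)  # the until() rule stops once the output starts increasing
print(outputs[1][-1].eval())  # 8.0: the first step size past the minimum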
def __mapper(self, train_example):
    pos_triple, neg_triple = train_example[0:3], train_example[3:]
    unconstrained_objective = self.margin - self.__objective_triple(neg_triple) \
        + self.__objective_triple(pos_triple)
    entity_normalize = T.sum(T.square(self.Entity.norm(2, axis=0)) - 1)
    relation_normalize = T.square(self.Relation.norm(2, axis=0))
    surface_normalize = T.square(T.diagonal(T.dot(self.RelationNormal.T, self.Relation))) \
        / relation_normalize
    surface_normalize = T.sum(surface_normalize - self.epsilon ** 2)
    unconstrained_objective_positive = ifelse(T.gt(unconstrained_objective, theano.shared(0.0)),
                                              unconstrained_objective, theano.shared(0.0))
    entity_normalize_positive = ifelse(T.gt(entity_normalize, theano.shared(0.0)),
                                       entity_normalize, theano.shared(0.0))
    surface_normalize_positive = ifelse(T.gt(surface_normalize, theano.shared(0.0)),
                                        surface_normalize, theano.shared(0.0))
    return unconstrained_objective_positive + self.regularize_factor \
        * (surface_normalize_positive + entity_normalize_positive)
def get_probs(self):
    t = self.temperatures
    t_term = (1. / t - T.roll(1. / t, shift=-1))
    t_term = T.set_subtensor(t_term[-1], 0)
    e_term = self.energy_(self.pps) - T.roll(self.energy_(self.pps), shift=-1)
    e_term = T.set_subtensor(e_term[-1], 0.)
    probs = T.exp(t_term * e_term)
    actions = T.cast(T.gt(probs, self.t_rng.uniform((probs.shape))), fx)
    add = T.concatenate([[np.cast[fx](0.)], actions])
    add = T.roll(add, shift=-1) - add
    add = add[:-1]
    add = T.switch(T.gt(add, 0), 1., 0.)
    add = T.set_subtensor(add[-1], 0.)
    add = add - T.roll(add, shift=1)
    idx = T.arange(actions.shape[0], dtype=fx)
    idx = idx + add
    return self.energy_(self.pps)