def advanced_indexing(volume, *indices_list, **kwargs):
    """Gather rows of `volume` addressed by one index per leading axis.

    Theano<=0.9 only supports advanced indexing along the first dimension,
    so the per-axis indices are folded into a single flat row index into
    ``volume.reshape((-1, volume.shape[-1]))``.

    Parameters
    ----------
    volume : symbolic tensor
        Tensor to index. Assumed to be C contiguous.
    *indices_list : symbolic index vectors
        One index per addressed axis, in axis order. Every index is clipped
        to ``[0, size_of_its_axis - 1]`` before use.
    **kwargs
        ``strides`` : optional per-axis strides. When absent they are
        derived from the first ``len(indices_list)`` entries of
        ``volume.shape``.

    Returns
    -------
    symbolic tensor
        The selected rows of the flattened volume.

    Notes
    -----
    NOTE(review): the flat index lines up with the reshape only when
    `indices_list` addresses every axis except the last one - confirm
    against callers.
    """
    n_idx = len(indices_list)

    strides = kwargs.get("strides")
    if strides is None:
        lead_shape = T.cast(volume.shape[:n_idx], dtype=theano.config.floatX)
        # Row-major strides: 1 for the innermost addressed axis, running
        # products of the trailing sizes for the outer ones.
        trailing_products = T.cumprod(lead_shape[::-1])[:-1]
        strides = T.concatenate([T.ones((1,)), trailing_products], axis=0)[::-1]

    full_shape = T.cast(volume.shape, dtype=theano.config.floatX)

    def _clipped(index, axis):
        # Keep the index inside [0, size - 1] for the given axis.
        return T.maximum(0, T.minimum(index, full_shape[axis] - 1))

    # The innermost addressed axis contributes with an implicit stride of 1.
    flat_index = _clipped(indices_list[-1], n_idx - 1)
    for axis, index in enumerate(indices_list[:-1]):
        flat_index = flat_index + _clipped(index, axis) * strides[axis]

    rows = volume.reshape((-1, volume.shape[-1]))
    return rows[T.cast(flat_index, dtype="int32")]
def get(self, y_p, i, g):
    # Fetch the tensors stored for level `i` (via self.item) and build the
    # tanh-squashed query state `h_p` from `y_p`.
    #
    # y_p : previous output/state fed into the attention (symbolic)
    # i   : index passed to self.item / used to pick 'P_%d' and glimpses[i]
    # g   : glimpse counter; for g > 0 the last stored glimpse is added in
    #
    # Returns the 6-tuple (B, C, I, h_p, W_att_in, b_att_in).
    # NOTE(review): another caller in this file unpacks only five values
    # from self.get(...) - confirm which arity is current.
    W_att_re = self.item("W_att_re", i)
    b_att_re = self.item("b_att_re", i)
    B = self.item("B", i)
    C = self.item("C", i)
    I = self.item("I", i)
    # Beam width: |attrs['beam']|, capped by the first dimension of C.
    beam_size = T.minimum(numpy.int32(abs(self.attrs['beam'])), C.shape[0])
    # Start of the beam window, clamped to [0, sum(I) - beam_size].
    # presumably sum(I, axis=0) is the per-sequence length - verify.
    loc = T.cast(T.maximum(T.minimum(T.sum(I,axis=0) * self.n / self.bound - beam_size / 2, T.sum(I,axis=0) - beam_size), 0),'int32')
    if self.attrs['beam'] > 0:
        # Restrict I, C and B to the beam: rows of custom_vars['P_<i>']
        # picked by `loc` act as a mask; its nonzero positions index the
        # time*batch-flattened tensors, which are then reshaped back with
        # beam_size as the leading dimension.
        beam_idx = (self.custom_vars[('P_%d' % i)][loc].dimshuffle(1,0).flatten() > 0).nonzero()
        I = I.reshape((I.shape[0]*I.shape[1],))[beam_idx].reshape((beam_size,I.shape[1]))
        C = C.reshape((C.shape[0]*C.shape[1],C.shape[2]))[beam_idx].reshape((beam_size,C.shape[1],C.shape[2]))
        B = B.reshape((B.shape[0]*B.shape[1],B.shape[2]))[beam_idx].reshape((beam_size,B.shape[1],B.shape[2]))
    # Project y_p into the template space only when the sizes differ.
    if self.attrs['template'] != self.layer.unit.n_out:
        z_p = T.dot(y_p, W_att_re) + b_att_re
    else:
        z_p = y_p
    if self.attrs['momentum'] == 'conv1d':
        # Convolve the stored attention weights 'att' with filters F, crop
        # the 'full' convolution back around its centre, project with U,
        # mask/normalise with I and add the resulting context to z_p.
        # NOTE(review): F.shape[2]/2 relies on integer division semantics
        # of the running Python/Theano configuration - confirm.
        from theano.tensor.nnet import conv
        att = self.item('att', i)
        F = self.item("F", i)
        v = T.dot(T.sum(conv.conv2d(border_mode='full', input=att.dimshuffle(1, 'x', 0, 'x'), filters=F).dimshuffle(2,3,0,1),axis=1)[F.shape[2]/2:-F.shape[2]/2+1],self.item("U",i))
        v = I * v / v.sum(axis=0,keepdims=True)
        z_p += T.sum(C * v,axis=0)
    if g > 0:
        # Later glimpses start from the most recent stored glimpse.
        z_p += self.glimpses[i][-1]
    h_p = T.tanh(z_p)
    return B, C, I, h_p, self.item("W_att_in", i), self.item("b_att_in", i)
def call(self, X):
    """Attenuate everything outside a (scaled) box in each frame.

    `X` must be a list ``[frame, position]``. The last three axes of
    `frame` are unpacked as (channels, height, width); columns 0..3 of
    `position` are used as (x0, y0, x1, y1) in the [-1, 1] coordinate
    system of the sampling grid. The box is grown by ``self.scale / 2``
    times its extent on each side and clamped to [-1, 1]. Pixels inside
    the box keep their value, pixels outside are multiplied by
    ``self.alpha``. The result has the shape of the input frame.
    """
    well_formed = type(X) is list and len(X) == 2
    if not well_formed:
        raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))
    frame, position = X

    # Collapse all leading (e.g. time) dimensions into one batch axis.
    frame_shape = K.shape(frame)
    chans, height, width = frame_shape[-3:]
    target_dim = K.shape(position)[-1]
    frame = K.reshape(frame, (-1, chans, height, width))
    position = K.reshape(position, (-1, target_dim))

    # Half-margins are computed once from the un-grown box.
    half_w = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
    half_h = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0

    # Grow each edge in turn. Note that every clamp is applied to the whole
    # position tensor, not only to the column that was just written.
    left = position[:, 0] - half_w
    position = THT.maximum(THT.set_subtensor(position[:, 0], left), -1.0)
    right = position[:, 2] + half_w
    position = THT.minimum(THT.set_subtensor(position[:, 2], right), 1.0)
    top = position[:, 1] - half_h
    position = THT.maximum(THT.set_subtensor(position[:, 1], top), -1.0)
    bottom = position[:, 3] + half_h
    position = THT.minimum(THT.set_subtensor(position[:, 3], bottom), 1.0)

    grid_x = Data.linspace(-1.0, 1.0, width)
    grid_y = Data.linspace(-1.0, 1.0, height)

    def _inside(grid, lo_col, hi_col):
        # 1 where lo < grid <= hi for each sample, 0 elsewhere.
        lo = position[:, lo_col].dimshuffle(0, 'x')
        hi = position[:, hi_col].dimshuffle(0, 'x')
        return THT.gt(grid, lo) * THT.le(grid, hi)

    in_x = _inside(grid_x, 0, 2)
    in_y = _inside(grid_y, 1, 3)

    mask = in_y.dimshuffle(0, 1, 'x') * in_x.dimshuffle(0, 'x', 1)
    # Turn the 0/1 mask into alpha/1: add alpha everywhere, then take it
    # back wherever the mask was already positive.
    mask = mask + self.alpha - THT.gt(mask, 0.) * self.alpha
    attended = frame * mask.dimshuffle(0, 'x', 1, 2)

    # Restore the original leading dimensions.
    return K.reshape(attended, frame_shape)
def lp_norm(self, n, k, r, c, z):
    '''
    Write the Lp norm of one pooling window into ``z[n, k, r, c]``.

    Lp = ( 1/n * sum(|x_i|^p, 1..n))^(1/p) where p = 1 + ln(1+e^P)

    :param n: index on the first axis of ``self.y`` / ``z``
    :param k: index on the second axis of ``self.y`` / ``z``
    :param r: output row; the window starts at ``r * stride[0]``
    :param c: output column; the window starts at ``c * stride[1]``
    :param z: tensor receiving the result
    :return: ``z`` with element ``[n, k, r, c]`` set to the window's Lp norm
    '''
    pool_h, pool_w = self.pool_size
    stride_h, stride_w = self.stride
    pad_h = self.pad[0]
    pad_w = self.pad[1]

    # Window bounds along rows, kept out of the padded border.
    top = r * stride_h
    bottom = T.minimum(top + pool_h, self.img_rows)
    top = T.maximum(top, pad_h)
    bottom = T.minimum(bottom, self.x_m2d + pad_h)

    # Window bounds along columns, same treatment.
    left = c * stride_w
    right = T.minimum(left + pool_w, self.img_cols)
    left = T.maximum(left, pad_w)
    right = T.minimum(right, self.x_m1d + pad_w)

    # softplus(P) + 1 keeps the exponent above 1.
    p = 1 + T.log(1 + T.exp(self.P))
    window = T.flatten(self.y[n, k, top:bottom, left:right], 1)
    Lp = T.pow(T.mean(T.pow(T.abs_(window), p)), 1 / p)
    return T.set_subtensor(z[n, k, r, c], Lp)
def _output(self, input, *args, **kwargs):
    """Apply a leaky rectifier with slope ``self.alpha`` on negative inputs.

    ``x + min(0, x) * (alpha - 1)`` leaves non-negative values untouched and
    scales negative values by ``alpha``. Only the first
    ``self.affected_channels`` channels (axis 1) are transformed; any
    remaining channels are passed through unchanged.
    """
    slope = (self.alpha - 1).reshape(self.filter_shape)

    def _leak(values):
        return values + T.minimum(0, values) * slope

    if self.affected_channels == self.n_channel:
        return _leak(input)

    split = self.affected_channels
    transformed = _leak(input[:, :split])
    untouched = input[:, split:]
    return T.concatenate([transformed, untouched], axis=1)
def _interpolate(im, x, y, out_height, out_width):
    """Bilinearly sample `im` at the normalised coordinates (`x`, `y`).

    `im` is unpacked as [num_batch, height, width, channels]. `x` and `y`
    are flat coordinate vectors in [-1, 1] (values outside are clipped),
    holding ``out_height * out_width`` entries per batch element. Returns
    the interpolated pixels, one row of `channels` values per coordinate.
    """
    num_batch, height, width, channels = im.shape
    float_x = theano.config.floatX
    height_f = T.cast(height, float_x)
    width_f = T.cast(width, float_x)

    # Clip to [-1, 1], then map to pixel coordinates [0, size - 1].
    x = (T.clip(x, -1, 1) + 1) / 2 * (width_f - 1)
    y = (T.clip(y, -1, 1) + 1) / 2 * (height_f - 1)

    # 2x2 neighbourhood: float versions feed the weights, int64 versions
    # feed the lookup. The far corners are capped at the image border for
    # indexing only.
    left_f = T.floor(x)
    top_f = T.floor(y)
    right_f = left_f + 1
    bottom_f = top_f + 1
    left = T.cast(left_f, 'int64')
    top = T.cast(top_f, 'int64')
    right = T.cast(T.minimum(right_f, width_f - 1), 'int64')
    bottom = T.cast(T.minimum(bottom_f, height_f - 1), 'int64')

    # Offsets into the image flattened to [num_batch*height*width, channels].
    row_stride = width
    image_stride = width*height
    image_offset = T.repeat(
        T.arange(num_batch, dtype='int64')*image_stride, out_height*out_width)
    top_row = image_offset + top*row_stride
    bottom_row = image_offset + bottom*row_stride

    pixels = im.reshape((-1, channels))

    # (weight, flat index) for the four neighbours.
    corners = [
        ((right_f-x) * (bottom_f-y), top_row + left),
        ((right_f-x) * (y-top_f), bottom_row + left),
        ((x-left_f) * (bottom_f-y), top_row + right),
        ((x-left_f) * (y-top_f), bottom_row + right),
    ]
    output = T.sum(
        [weight.dimshuffle(0, 'x')*pixels[index] for weight, index in corners],
        axis=0)
    assert str(output.dtype) == theano.config.floatX, str(output.dtype)
    return output
def _interpolate(im, x, y, out_height, out_width, num_b):
    """Bilinearly sample `im` at (`x`, `y`), keeping indices in floatX.

    Same scheme as the usual spatial-transformer interpolation, except that
    the batch size is passed in as `num_b` and all index arithmetic is done
    in floatX; the cast to int64 happens only at the lookup itself (per the
    original author's note, to maximise GPU usage).

    `im` is unpacked as [_, height, width, channels]; `x` and `y` are flat
    coordinate vectors in [-1, 1] (values outside are clipped).
    """
    _, height, width, channels = im.shape
    float_x = theano.config.floatX
    height_f = T.cast(height, float_x)
    width_f = T.cast(width, float_x)

    # Clip to [-1, 1], then map to pixel coordinates [0, size - 1].
    x = (T.clip(x, -1, 1) + 1) / 2 * (width_f - 1)
    y = (T.clip(y, -1, 1) + 1) / 2 * (height_f - 1)

    left_f = T.floor(x)
    top_f = T.floor(y)
    right_f = left_f + 1
    bottom_f = top_f + 1

    # Index versions of the corners (still float); far corners are capped
    # at the image border.
    left = T.floor(left_f)
    top = T.floor(top_f)
    right = T.floor(T.minimum(right_f, width_f - 1))
    bottom = T.floor(T.minimum(bottom_f, height_f - 1))

    row_stride = width_f
    image_stride = width_f * height_f
    image_offset = T.repeat(
        T.arange(num_b, dtype=theano.config.floatX) * image_stride,
        out_height * out_width)
    top_row = image_offset + top * row_stride
    bottom_row = image_offset + bottom * row_stride

    pixels = im.reshape((-1, channels))

    # (weight, flat float index) for the four neighbours.
    corners = [
        ((right_f - x) * (bottom_f - y), top_row + left),
        ((right_f - x) * (y - top_f), bottom_row + left),
        ((x - left_f) * (bottom_f - y), top_row + right),
        ((x - left_f) * (y - top_f), bottom_row + right),
    ]
    output = T.sum(
        [weight.dimshuffle(0, 'x') * pixels[T.cast(index, 'int64')]
         for weight, index in corners],
        axis=0)
    return output
def _interpolate(im, x, y, out_height, out_width, dtype = 'float32'):
    # Bilinear sampling of `im` at (x, y), restricted to the coordinates
    # that fall inside the unit square; all other output rows stay zero.
    #
    # im    : unpacked below as [num_batch, height, width, channels]
    # x, y  : flat coordinate vectors, treated here as normalised to [0, 1]
    # dtype : dtype used for the float versions of height/width
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, dtype = dtype)
    width_f = T.cast(width, dtype = dtype)
    # positions of the coordinates lying inside [0, 1] x [0, 1]; only these
    # are looked up and written to the output
    idx = ((x >= 0) & (x <= 1) & (y >= 0) & (y <= 1)).nonzero()[0]
    # scale coordinates from [0, 1] to [0, width/height - 1]
    # (the [-1, 1] mapping used by other variants is left disabled below)
    # x = (x + 1) / 2 * (width_f - 1)
    # y = (y + 1) / 2 * (height_f - 1)
    x = x * (width_f - 1)
    y = y * (height_f - 1)
    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing. for
    # indexing, we need to take care they do not extend past the image.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1
    x0 = T.cast(x0_f, 'int64')
    y0 = T.cast(y0_f, 'int64')
    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')
    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1
    # use indices to lookup pixels, for the in-range coordinates only
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a[idx]]
    Ib = im_flat[idx_b[idx]]
    Ic = im_flat[idx_c[idx]]
    Id = im_flat[idx_d[idx]]
    # calculate interpolated values (weights restricted to the same subset)
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')[idx, :]
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')[idx, :]
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')[idx, :]
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')[idx, :]
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
    # out = T.zeros_like(((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x'))
    # NOTE(review): the result buffer is shaped like the flattened *input*
    # image, so scattering by `idx` presumably requires the output grid to
    # have as many points as the input has pixels - confirm with callers.
    out = T.zeros_like(im_flat)
    return T.set_subtensor(out[idx, :], output)
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Accumulate a weight-gradient step from the current input/error spikes.

    Only rows (for input spikes) and columns (for error spikes) of `dws`
    belonging to units that spiked this step are incremented. Four shared
    variables created inside this function hold, per unit, the last decayed
    value and a step counter since the last spike; updates for them are
    registered through ``add_update``.
    (The original author notes that this still runs very slowly.)

    :param xs: An (n_in) vector; nonzero entries are treated as spikes
    :param es: An (n_out) vector; nonzero entries are treated as spikes
    :param kp_x: combined with kd_x into the decay ratio rx = kd_x/(kp_x+kd_x)
    :param kd_x: see kp_x; incoming x spikes are also divided by kd_x
    :param kp_e: combined with kd_e into the decay ratio re = kd_e/(kp_e+kd_e)
    :param kd_e: see kp_e; incoming e spikes are also divided by kd_e
    :param shape: (n_in, n_out)
    :param dws: optional (n_in, n_out) accumulator; zeros when None
    :return: the symbolic accumulator with this step's contributions added
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    # Persistent per-unit state: step counters (starting at 1) and the last
    # stored values (starting at 0).
    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    # Pass 1: rows of the input units that spiked, using the state from
    # before this step.
    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    # Refresh the x state for the spiking units before handling e spikes:
    # decay the stored value over the elapsed steps, add the new spike, and
    # reset the counter to 0.
    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs]+ xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    # Pass 2: columns of the error units that spiked, using the refreshed
    # x state.
    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    # Register the state updates. Counters advance by one; e counters of
    # spiking units restart at 1, x counters were reset to 0 above and so
    # also become 1.
    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs]+ es[e_spike_ixs]/as_floatx(kd_e)))
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
def _log_add_3(log_a, log_b, log_c):
    """Theano expression for log(a+b+c) given log(a), log(b), log(c).

    The largest of the three logs is factored out so that only
    non-positive differences are exponentiated.
    """
    low_ab = T.minimum(log_a, log_b)
    high_ab = T.maximum(log_a, log_b)
    # Fold in log_c: `top` is the overall maximum, `rest` is whichever of
    # (max(a, b), c) was not the maximum.
    top = T.maximum(high_ab, log_c)
    rest = T.minimum(high_ab, log_c)
    remainder = T.exp(low_ab - top) + T.exp(rest - top)
    return top + T.log1p(remainder)
def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.

    `boxes` holds coordinates in repeating groups of four columns
    (x1, y1, x2, y2). x columns are limited to ``[0, im_shape[1] - 1]`` and
    y columns to ``[0, im_shape[0] - 1]``. Returns the clipped tensor.
    """
    # (column offset within each group of four, im_shape axis bounding it)
    for offset, axis in ((0, 1), (1, 0), (2, 1), (3, 0)):
        upper = im_shape[axis] - 1
        column = boxes[:, offset::4]
        boxes = T.set_subtensor(column, T.maximum(T.minimum(column, upper), 0))
    return boxes
def perform(self, x):
    """Clamp `x` between per-feature bounds taken from ``self.params``.

    ``self.params[0]`` is the upper bound and ``self.params[1]`` the lower
    bound; both are broadcast along every axis of `x` except the last
    (two leading axes for 3-D input, one otherwise).
    """
    pattern = ('x', 'x', 0) if x.ndim == 3 else ('x', 0)
    upper = self.params[0].dimshuffle(*pattern)
    lower = self.params[1].dimshuffle(*pattern)
    return T.minimum(T.maximum(lower, x), upper)
def perform(self, x):
    """Clamp `x` to per-feature bounds and rescale the result to about [0, 1].

    ``self.params[0]`` is the upper bound and ``self.params[1]`` the lower
    bound; both are broadcast along every axis of `x` except the last.
    A small epsilon in the denominator guards against equal bounds.
    """
    EPSI = 1e-6
    pattern = ('x', 'x', 0) if x.ndim == 3 else ('x', 0)
    upper = self.params[0].dimshuffle(*pattern)
    lower = self.params[1].dimshuffle(*pattern)
    clamped = T.minimum(T.maximum(lower, x), upper)
    return (clamped-lower)/(upper-lower+EPSI)
def create_activation(activation):
    '''Given an activation description, return a callable that implements it.

    Several activations can be chained with ``+``; they are applied from
    left to right (``'a+b'`` computes ``b(a(z))``).

    Parameters
    ----------
    activation : string
        A string description of an activation function to use. Single
        names are looked up case-insensitively.

    Returns
    -------
    activation : callable(float) -> float
        A callable activation function, tagged with a
        ``__theanets_name__`` attribute.

    Raises
    ------
    KeyError
        If the description does not name a known activation.
    '''
    def compose(inner, outer):
        chained = lambda z: outer(inner(z))
        chained.__theanets_name__ = '%s(%s)' % (
            outer.__theanets_name__, inner.__theanets_name__)
        return chained

    if '+' in activation:
        pieces = (create_activation(name) for name in activation.split('+'))
        return functools.reduce(compose, pieces)

    def floored(denominator):
        # Keep normalisation denominators away from zero.
        return TT.maximum(TT.cast(1e-7, FLOAT), denominator)

    options = {
        'tanh': TT.tanh,
        'linear': lambda z: z,
        'logistic': TT.nnet.sigmoid,
        'sigmoid': TT.nnet.sigmoid,
        'softplus': TT.nnet.softplus,
        'softmax': softmax,

        # rectification
        'relu': lambda z: TT.maximum(0, z),
        'trel': lambda z: TT.maximum(0, TT.minimum(1, z)),
        'trec': lambda z: TT.maximum(1, z),
        'tlin': lambda z: z * (abs(z) > 1),

        # modifiers
        'rect:max': lambda z: TT.minimum(1, z),
        'rect:min': lambda z: TT.maximum(0, z),

        # normalization
        'norm:dc': lambda z: z - z.mean(axis=-1, keepdims=True),
        'norm:max': lambda z: z / floored(abs(z).max(axis=-1, keepdims=True)),
        'norm:std': lambda z: z / floored(TT.std(z, axis=-1, keepdims=True)),
        'norm:z': lambda z: (z - z.mean(axis=-1, keepdims=True)) /
        floored(z.std(axis=-1, keepdims=True)),
    }

    # Tag every entry with its key so that composed names can be built.
    for label, fn in options.items():
        fn.__theanets_name__ = label

    key = activation.lower()
    if key not in options:
        raise KeyError('unknown activation {}'.format(activation))
    return options[key]
def __step(img, prev_bbox, state, timestep):
    # One recurrent step: conv features -> GRU -> bounding box -> soft box
    # mask, followed by five gradient steps on a per-sequence classifier.
    # Relies on many names from the enclosing scope (filters, GRU weights,
    # cls_f/cls_b, featmaps/probmaps, sizes).
    # NOTE(review): `TT` looks like theano.tensor and `T` like theano itself
    # (T.config, T.grad) - confirm against the imports.
    #
    # Returns (outputs, updates): outputs is
    # (bbox, gru_h, timestep + 1, mask, bbox_mask); updates maps cls_f, cls_b,
    # featmaps and probmaps to their new (gradient-disconnected) values.

    # --- feature extraction ---
    conv1 = conv2d(img, conv1_filters, subsample=(conv1_stride, conv1_stride), border_mode='half')
    act1 = NN.relu(conv1)
    flat1 = TT.reshape(act1, (-1, conv1_output_dim))

    # --- GRU over [features, previous box] ---
    gru_in = TT.concatenate([flat1, prev_bbox], axis=1)
    gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
    gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
    gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
    gru_h = (1 - gru_z) * state + gru_z * gru_h_

    # Box prediction in [-1, 1] (tanh), columns used as (x0, y0, x1, y1).
    bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)

    # Box centre and extent rescaled from [-1, 1] to image units.
    bbox_cx = ((bbox[:, 2] + bbox[:, 0]) / 2 + 1) / 2 * img_row
    bbox_cy = ((bbox[:, 3] + bbox[:, 1]) / 2 + 1) / 2 * img_col
    bbox_w = TT.abs_(bbox[:, 2] - bbox[:, 0]) / 2 * img_row
    bbox_h = TT.abs_(bbox[:, 3] - bbox[:, 1]) / 2 * img_col

    # Soft 1-D box profiles: (half-extent - distance to centre), clamped to
    # [1e-4, 1]; their outer product is the 2-D mask.
    x = TT.arange(img_row, dtype=T.config.floatX)
    y = TT.arange(img_col, dtype=T.config.floatX)
    mx = TT.maximum(TT.minimum(-TT.abs_(x.dimshuffle('x', 0) - bbox_cx.dimshuffle(0, 'x')) + bbox_w.dimshuffle(0, 'x') / 2., 1), 1e-4)
    my = TT.maximum(TT.minimum(-TT.abs_(y.dimshuffle('x', 0) - bbox_cy.dimshuffle(0, 'x')) + bbox_h.dimshuffle(0, 'x') / 2., 1), 1e-4)
    bbox_mask = mx.dimshuffle(0, 1, 'x') * my.dimshuffle(0, 'x', 1)

    # Classifier parameters start from the current values and are refined
    # in the loop below.
    new_cls1_f = cls_f
    new_cls1_b = cls_b
    mask = act1 * bbox_mask.dimshuffle(0, 'x', 1, 2)

    # Store this step's masked features / mask at position `timestep`;
    # no gradient flows through the stored history.
    new_featmaps = TG.disconnected_grad(TT.set_subtensor(featmaps[:, timestep], mask))
    new_featmaps.name = 'new_featmaps'
    new_probmaps = TG.disconnected_grad(TT.set_subtensor(probmaps[:, timestep], bbox_mask))
    new_probmaps.name = 'new_probmaps'

    # Training set for the classifier: everything stored up to this step.
    train_featmaps = TG.disconnected_grad(new_featmaps[:, :timestep+1].reshape(((timestep + 1) * batch_size, conv1_nr_filters, img_row, img_col)))
    train_featmaps.name = 'train_featmaps'
    train_probmaps = TG.disconnected_grad(new_probmaps[:, :timestep+1])
    train_probmaps.name = 'train_probmaps'

    # Five plain gradient-descent steps (step size 0.1) on the classifier.
    for _ in range(0, 5):
        train_convmaps = conv2d(train_featmaps, new_cls1_f, subsample=(cls1_stride, cls1_stride), border_mode='half').reshape((batch_size, timestep + 1, batch_size,
            img_row, img_col))
        train_convmaps.name = 'train_convmaps'
        # Pick, for every (sample b, time t), output map b of that sample.
        train_convmaps_selected = train_convmaps[TT.arange(batch_size).repeat(timestep+1), TT.tile(TT.arange(timestep+1), batch_size), TT.arange(batch_size).repeat(timestep+1)].reshape((batch_size, timestep+1, img_row, img_col))
        train_convmaps_selected.name = 'train_convmaps_selected'
        train_predmaps = NN.sigmoid(train_convmaps_selected + new_cls1_b.dimshuffle(0, 'x', 'x', 'x'))
        train_loss = NN.binary_crossentropy(train_predmaps, train_probmaps).mean()
        train_grad_cls1_f, train_grad_cls1_b = T.grad(train_loss, [new_cls1_f, new_cls1_b])
        new_cls1_f -= train_grad_cls1_f * 0.1
        new_cls1_b -= train_grad_cls1_b * 0.1

    return (bbox, gru_h, timestep + 1, mask, bbox_mask), {cls_f: TG.disconnected_grad(new_cls1_f), cls_b: TG.disconnected_grad(new_cls1_b), featmaps: TG.disconnected_grad(new_featmaps), probmaps: TG.disconnected_grad(new_probmaps)}
def get_constraint_updates(self):
    """Build the post-update projections that keep parameters feasible.

    Returns
    -------
    OrderedDict
        Maps a parameter to the symbolic expression it should be replaced
        with after each learning step. Contains, depending on
        ``self.flags`` / attributes:

        * ``Wv`` normalised by ``self.norm_wv`` (``'unit'``), or normalised
          only where the norm exceeds 1 (``'max_unit'``);
        * ``lambd`` replaced by its mean when ``flags['scalar_lambd']``;
        * ``Wg`` multiplied by the sparsity mask when ``self.sparse_gmask``;
        * every parameter named in ``self.clip_max`` / ``self.clip_min``
          limited from above / below.

    Notes
    -----
    Clipping is applied on top of any constraint already registered for the
    same parameter. The previous implementation looked pending updates up
    by the parameter *name*, which never matches the variable keys, so an
    upper clip silently replaced e.g. the norm constraint; it also used the
    raw parameter as a clip bound, which could pull a pending value back to
    the unconstrained parameter.
    """
    constraint_updates = OrderedDict()

    wv_mode = self.flags['wv_norm']
    if wv_mode == 'unit':
        constraint_updates[self.Wv] = self.Wv / self.norm_wv
    elif wv_mode == 'max_unit':
        constraint_updates[self.Wv] = self.Wv / self.norm_wv * T.minimum(self.norm_wv, 1.0)

    if self.flags['scalar_lambd']:
        constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

    ## Enforce sparsity pattern on g if required ##
    if self.sparse_gmask:
        constraint_updates[self.Wg] = self.Wg * self.sparse_gmask.mask.T

    param_names = [param.name for param in self.params()]

    ## clip parameters to maximum values (if applicable)
    for (k, v) in self.clip_max.items():
        assert k in param_names
        param = getattr(self, k)
        pending = constraint_updates.get(param, param)
        constraint_updates[param] = T.minimum(pending, v)

    ## clip parameters to minimum values (if applicable)
    for (k, v) in self.clip_min.items():
        assert k in param_names
        param = getattr(self, k)
        pending = constraint_updates.get(param, param)
        constraint_updates[param] = T.maximum(pending, v)

    return constraint_updates
def learning_rate_updates(self):
    """
    Compute the shared variable updates related to annealing the learning
    rate, together with the per-parameter learning rates.

    The annealed rate is ``min(base_lr, anneal_start / (iteration + 1))``,
    or simply ``base_lr`` when ``anneal_start`` is None.

    Returns
    -------
    ups : dict
        A dictionary with the shared variables representing SGD metadata
        (``self.annealed``, ``self.iteration``) as keys and a symbolic
        expression of how they are to be updated as values.
    learn_rates : list
        One symbolic learning rate per parameter, in the order they appear
        in ``self.params``.
    """
    if self.anneal_start is None:
        annealed = sharedX(self.base_lr)
    else:
        decayed = self.anneal_start / (self.iteration + 1.)
        # base_lr acts as the maximum learning rate.
        annealed = tensor.minimum(as_floatX(decayed), self.base_lr)

    ups = {}
    ups[self.annealed] = annealed
    ups[self.iteration] = self.iteration + 1

    learn_rates = []
    for p in self.params:
        learn_rates.append(annealed * self.learning_rates[p])
    return ups, learn_rates
def _step(self,
          xsum_t, xmax_t, xmin_t, xsubt_t, xmul_t, xres_t, xone_t,
          xi_t, xf_t, xo_t, xc_t, mask_tm1,
          h_tm1, c_tm1,
          u_sum, u_max, u_min, u_subt, u_mul, u_res, u_one,
          u_i, u_f, u_o, u_c):
    """One recurrent step of an LSTM-like cell with nine mixing gates.

    The ``x*_t`` arguments are the input contributions for each gate, the
    ``u_*`` arguments the matching recurrent weights. The previous hidden
    and cell states are multiplied by ``mask_tm1`` first. The new cell is a
    gated sum of nine candidates built from the previous cell and the
    proposal ``c_tilda``: previous cell, proposal, their sum, maximum,
    minimum, difference, product, zero and one.

    Returns ``(h_t, c_t)``.
    """
    h_prev = mask_tm1 * h_tm1
    c_prev = mask_tm1 * c_tm1

    def gate(x_t, u):
        # Gate activation from its input term and the recurrent term.
        return self.inner_activation(x_t + T.dot(h_prev, u))

    c_tilda = self.activation(xc_t + T.dot(h_prev, u_c))

    # (gate, candidate) pairs, in summation order. Note that the gate fed
    # by xi_t weights the previous cell and the one fed by xf_t weights
    # the proposal.
    mixture = [
        (gate(xi_t, u_i), c_prev),
        (gate(xf_t, u_f), c_tilda),
        (gate(xsum_t, u_sum), c_prev + c_tilda),
        (gate(xmax_t, u_max), T.maximum(c_prev, c_tilda)),
        (gate(xmin_t, u_min), T.minimum(c_prev, c_tilda)),
        (gate(xsubt_t, u_subt), c_prev - c_tilda),
        (gate(xmul_t, u_mul), c_prev * c_tilda),
        (gate(xres_t, u_res), 0 * c_tilda),
        (gate(xone_t, u_one), 0 * c_tilda + 1),
    ]

    c_t = None
    for weight, candidate in mixture:
        term = weight * candidate
        c_t = term if c_t is None else c_t + term

    o_t = gate(xo_t, u_o)
    h_t = o_t * self.activation(c_t)
    return h_t, c_t
def get_output_for(self, input, only_at_anchor=False, **kwargs):
    """Dense layer whose units are weighted by a learned attention bump.

    An anchor position in ``[0, num_units]`` is computed per sample from
    the input (rectified affine map, capped at 1, scaled by ``num_units``)
    and stored on ``self.at_anchor``. The regular dense activation is then
    multiplied by ``exp(at_decay * (unit_index - anchor) ** 2)``.

    With ``only_at_anchor=True`` only the anchor is returned.
    """
    if input.ndim > 2:
        # Treat everything past the batch axis as one feature vector.
        input = input.flatten(2)

    # Anchor position derived from atw, atb and the input.
    raw_anchor = nonlinearities.rectify(T.dot(input, self.atw) + self.atb[0])
    at_anchor = T.minimum(raw_anchor, 1) * self.num_units
    self.at_anchor = at_anchor  # kept for printing / inspection

    if only_at_anchor:
        return at_anchor

    # Ordinary dense activation.
    pre_activation = T.dot(input, self.W)
    if self.b is not None:
        pre_activation = pre_activation + self.b.dimshuffle('x', 0)
    out = self.nonlinearity(pre_activation)

    # Attention weight per (sample, unit) from the distance to the anchor.
    unit_index = T.arange(0, self.num_units).dimshuffle('x', 0)
    distance = unit_index - at_anchor.dimshuffle(0, 'x')
    attention = T.exp(self.at_decay * distance ** 2)
    return out * attention
def smorms3(cost, params, lrate=1e-3, eps=1e-16, gather=False):
    """Build SMORMS3 updates for `params` with respect to `cost`.

    For every parameter three shared accumulators are created: a memory
    term (initialised to 1) and running averages of the gradient and the
    squared gradient (initialised to 0). The step for each element is
    ``grad * min(lrate, g^2 / (g2 + eps)) / (sqrt(g2 + eps) + eps)``.

    Returns the list of ``(shared_variable, new_value)`` update pairs.

    NOTE(review): with ``gather=True`` the accumulators are collected in a
    local list that is not returned - confirm whether callers expect it.
    """
    updates = []
    optim_params = []
    for p, grad in zip(params, T.grad(cost, params)):
        def zeros_like_param():
            return p.get_value() * 0.

        mem = sharedX(zeros_like_param() + 1.)
        g = sharedX(zeros_like_param())
        g2 = sharedX(zeros_like_param())
        if gather:
            optim_params.extend((mem, g, g2))

        # Averaging rate shrinks as the memory grows.
        rate = 1. / (mem + 1)
        g_t = (1 - rate) * g + rate * grad
        g2_t = (1 - rate) * g2 + rate * grad**2
        # Squared mean over mean square: close to 1 for consistent
        # gradients, close to 0 for noisy ones.
        signal = g_t * g_t / (g2_t + eps)

        p_t = p - grad * T.minimum(lrate, signal) / (T.sqrt(g2_t + eps) + eps)
        mem_t = 1 + mem * (1 - signal)

        updates.extend([(g, g_t), (g2, g2_t), (p, p_t), (mem, mem_t)])
    return updates
def update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    """Advance a log-probability vector by one step over the active states.

    Probability mass from the first `active` entries of `log_p_prev` is
    propagated to the same state, the next state, and - for the indices in
    `skip_idxs` below `active` - two states ahead. The result is combined
    with `log_p_curr`. Sums are done in probability space after
    subtracting the maximum for stability.

    Returns ``(active_next, log_p_next)``: the new number of active states
    (capped at ``log_p_curr.shape[0]``) and the updated log-probabilities
    written into a slice of `zeros`.
    """
    usable_skips = skip_idxs[(skip_idxs < active).nonzero()]
    # -1 is appended so that the max is defined when no skip is usable.
    furthest_skip = T.max(T.concatenate([usable_skips, [-1]]))
    reach = T.maximum(active + 1, furthest_skip + 2 + 1)
    active_next = T.cast(T.minimum(reach, log_p_curr.shape[0]), 'int32')

    shift = T.max(log_p_prev[:active])
    probs = T.exp(log_p_prev[:active] - shift)

    mass = zeros[:active_next]
    # copy over
    mass = T.set_subtensor(mass[:active], probs)
    # previous transitions
    mass = T.inc_subtensor(mass[1:], mass[:-1])
    # skip transitions
    mass = T.inc_subtensor(mass[usable_skips + 2], probs[usable_skips])

    propagated = T.log(mass) + shift
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + propagated
    )
    return active_next, log_p_next
def __init__(self, inputs, input_size, output_size, is_backward=False, parameters=None):
    """Build a simple recurrent layer over `inputs` with a capped ReLU.

    Parameters are created fresh (``W_if``, ``W_ff``, ``b``) or loaded from
    the `parameters` mapping under those same keys. The recurrence is
    ``out_t = min(relu(in_t + out_{t-1} . W_ff), 20)`` starting from zeros,
    scanned backwards when `is_backward` is set. The sequence of outputs
    is stored on ``self.output`` and the parameter list on ``self.params``.
    """
    if parameters is None:
        self.W_if = U.create_shared(U.initial_weights(input_size, output_size), name='W_if')
        self.W_ff = U.create_shared(U.initial_weights(output_size, output_size), name='W_ff')
        self.b = U.create_shared(U.initial_weights(output_size), name='b')
    else:
        for key in ('W_if', 'W_ff', 'b'):
            setattr(self, key, theano.shared(parameters[key], name=key))

    initial = T.zeros((output_size,))
    self.is_backward = is_backward

    def capped_relu(x):
        # Rectify, cap at 20 and force float32.
        return T.cast(T.minimum(x * (x > 0), 20), dtype='float32')

    self.activation_fn = capped_relu

    # The input projection does not depend on the recurrence, so it is
    # computed for all time steps at once.
    nonrecurrent = T.dot(inputs, self.W_if) + self.b

    def recurrence(in_t, out_tminus1, weights):
        return self.activation_fn(in_t + T.dot(out_tminus1, weights))

    self.output, _ = theano.scan(
        recurrence,
        sequences=[nonrecurrent],
        outputs_info=[initial],
        non_sequences=[self.W_ff],
        go_backwards=self.is_backward
    )
    self.params = [self.W_if, self.W_ff, self.b]
def _differentiate(self, params=None):
    '''Return a sequence of gradients for our parameters.

    If this optimizer has been configured with elementwise gradient
    clipping, or with a gradient norm limit, the corresponding clipping or
    rescaling is applied before the gradient is returned. Elementwise
    clipping takes precedence when both are configured.

    Parameters
    ----------
    params : list of Theano variables, optional
        Return the gradient with respect to these parameters. Defaults to
        all parameters that the optimizer knows about.

    Yields
    ------
    pairs : (param, grad) tuples
        Generates a sequence of tuples representing each of the parameters
        requested and the corresponding Theano gradient expressions.
    '''
    if params is None:
        params = self._params
    gradients = TT.grad(self._loss, params)
    for param, grad in zip(params, gradients):
        if self.max_gradient_elem > 0:
            bound = util.as_float(self.max_gradient_elem)
            grad = TT.clip(grad, -bound, bound)
        elif self.max_gradient_norm > 0:
            norm = TT.sqrt((grad * grad).sum())
            bound = util.as_float(self.max_gradient_norm)
            # Shrink only when the norm exceeds the limit.
            grad = grad * TT.minimum(1, bound / norm)
        yield param, grad
def get_constraint_updates(self):
    """Build the post-update projections that keep parameters feasible.

    Returns
    -------
    OrderedDict
        Maps a parameter to the symbolic expression it should be replaced
        with after each learning step. Contains:

        * ``lambd`` replaced by its mean when ``flags['scalar_lambd']``;
        * ``Wv`` with columns scaled to unit norm (``'unit'``), or scaled
          down only where the norm exceeds 1 (``'max_unit'``);
        * ``scalar_norms`` kept at or above 1;
        * every parameter named in ``self.clip_max`` / ``self.clip_min``
          limited from above / below.

    Notes
    -----
    Clipping is applied on top of any constraint already registered for the
    same parameter. The previous implementation looked pending updates up
    by the parameter *name*, which never matches the variable keys, so an
    upper clip silently replaced e.g. the norm constraint; it also used the
    raw parameter as a clip bound, which could pull a pending value back to
    the unconstrained parameter.
    """
    constraint_updates = OrderedDict()

    if self.flags['scalar_lambd']:
        constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

    # constraint filters to have unit norm
    wv_mode = self.flags['wv_norm']
    if wv_mode in ('unit', 'max_unit'):
        wv = constraint_updates.get(self.Wv, self.Wv)
        wv_norm = T.sqrt(T.sum(wv**2, axis=0))
        if wv_mode == 'unit':
            constraint_updates[self.Wv] = wv / wv_norm
        else:
            constraint_updates[self.Wv] = wv / wv_norm * T.minimum(wv_norm, 1.0)

    # NOTE(review): the source formatting was collapsed; this statement is
    # reconstructed as unconditional - confirm against the original layout.
    constraint_updates[self.scalar_norms] = T.maximum(1.0, self.scalar_norms)

    param_names = [param.name for param in self.params()]

    ## clip parameters to maximum values (if applicable)
    for (k, v) in self.clip_max.items():
        assert k in param_names
        param = getattr(self, k)
        pending = constraint_updates.get(param, param)
        constraint_updates[param] = T.minimum(pending, v)

    ## clip parameters to minimum values (if applicable)
    for (k, v) in self.clip_min.items():
        assert k in param_names
        param = getattr(self, k)
        pending = constraint_updates.get(param, param)
        constraint_updates[param] = T.maximum(pending, v)

    return constraint_updates
def irprop_minus_updates(params, grads):
    """Build iRprop- updates for `params` given their gradients.

    Each parameter element has its own step size. When the gradient keeps
    its sign from one call to the next the step grows by 1.2 (up to 1.0);
    when the sign flips the step shrinks by 0.5 (down to exp(-6)) and the
    gradient is treated as zero for that call, so the element is not moved
    and the next call starts fresh. Parameters move against the sign of the
    gradient by the current step size.

    Parameters
    ----------
    params : list of Theano shared variables
        Parameters to update; ``get_value()`` is used to size the state.
    grads : list of symbolic tensors
        Gradient of the cost with respect to each parameter, same order.

    Returns
    -------
    list of (shared variable, expression) pairs
        For every parameter: the parameter itself, its step sizes and its
        remembered gradient.

    Notes
    -----
    The previous version branched with a Python ``if`` on symbolic
    comparisons (always truthy, so only the "grow" branch was ever built),
    used the parameter values in place of the previous gradients, and kept
    the step sizes in a plain NumPy array, so nothing was adapted between
    calls.
    """
    # Imported here because only `T`, `numpy` and `math` are known to be
    # available at module level.
    import theano

    # IRPROP- parameters
    positive_step = 1.2
    negative_step = 0.5
    max_step = 1.
    min_step = math.exp(-6)
    initial_step = 0.1

    updates = []
    for param, gparam in zip(params, grads):
        value = param.get_value(borrow=True)
        # Persistent per-element state, shaped and typed like the parameter.
        delta = theano.shared(
            numpy.full(value.shape, initial_step, dtype=value.dtype),
            broadcastable=param.broadcastable)
        last_gparam = theano.shared(
            numpy.zeros(value.shape, dtype=value.dtype),
            broadcastable=param.broadcastable)

        # Positive: same sign as last time; negative: sign flipped;
        # zero: no information (first call, or right after a flip).
        change = gparam * last_gparam
        grown = T.minimum(delta * positive_step, max_step)
        shrunk = T.maximum(delta * negative_step, min_step)
        new_delta = T.switch(T.gt(change, 0), grown,
                             T.switch(T.lt(change, 0), shrunk, delta))

        # After a sign flip the gradient is dropped for this call.
        effective = T.switch(T.lt(change, 0), T.zeros_like(gparam), gparam)

        # Casts keep every update in the dtype of the variable it replaces.
        new_delta = T.cast(new_delta, delta.dtype)
        effective = T.cast(effective, last_gparam.dtype)
        new_param = T.cast(param - T.sgn(effective) * new_delta, param.dtype)

        # update the weights
        updates.append((param, new_param))
        # store the adapted step and the gradient for the next call
        updates.append((delta, new_delta))
        updates.append((last_gparam, effective))

    return updates
def softmax(self, D, I):
    # Turn distances `D` into attention weights, masked by `I`.
    # The normalisation is chosen by attrs['norm'] ('exp', 'sigmoid' or
    # 'lstm'); any other value raises NotImplementedError. When
    # attrs['nbest'] > 1, all but the largest `nbest` entries along axis 0
    # are zeroed afterwards.
    # presumably axis 0 of D/I is time and I is a 0/1 index mask - verify.
    D = D * T.constant(self.attrs['sharpening'], 'float32')
    if self.attrs['norm'] == 'exp':
        # exp(-D), masked, normalised over axis 0; the 1e-20 floor avoids a
        # division by zero when every entry is masked out.
        E = T.exp(-D) * I
        E = E / T.maximum(T.sum(E,axis=0,keepdims=True),T.constant(1e-20,'float32'))
    elif self.attrs['norm'] == 'sigmoid':
        # 1 - tanh(D)^2, masked; not normalised.
        E = (numpy.float32(1) - T.tanh(D)**2) * I
    elif self.attrs['norm'] == 'lstm':
        n_out = self.attrs['template']
        def lstm(z, i_t, s_p, h_p):
            # One LSTM step over the distance sequence. z is split into
            # four blocks of width n_out: input, ingate, forgetgate, outgate.
            z += T.dot(h_p, self.N_re)
            # Broadcast the per-sequence mask over the n_out units.
            i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
            ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out])
            forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out])
            outgate = T.nnet.sigmoid(z[:,3 * n_out:])
            input = T.tanh(z[:,:n_out])
            s_t = input * ingate + s_p * forgetgate
            h_t = T.tanh(s_t) * outgate
            # The gradient through the cell state is clipped to [-50, 50].
            return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
        # NOTE(review): theano.scan returns (outputs, updates); with two
        # outputs_info entries `E` is presumably a list of two sequences,
        # and the second initial state is declared int32 - confirm this
        # branch is exercised as intended.
        E, _ = theano.scan(lstm, sequences=[D,I], outputs_info=[T.zeros((n_out,), 'float32'), T.zeros((n_out,), 'int32')])
        E = T.nnet.sigmoid(T.dot(E,self.N_out))
    else:
        raise NotImplementedError()
    if self.attrs['nbest'] > 1:
        # Keep only entries at least as large as the nbest-th largest value
        # of their column; zero the rest.
        opt = T.minimum(self.attrs['nbest'], E.shape[0])
        score = (T.sort(E, axis=0)[-opt]).dimshuffle('x',0).repeat(E.shape[0],axis=0)
        E = T.switch(T.lt(E,score), T.zeros_like(E), E)
    return E
def prepare():
    """Build the symbolic graph for training/evaluating the network.

    Reads the module-level ``args`` mapping: with ``"regression"`` the
    network is trained with a mean squared error and predictions are the
    rounded outputs clamped to ``[0, num_classes - 1]``; otherwise a hybrid
    loss built from ``args["a"]`` and ``args["b"]`` is used and predictions
    are the arg-max over axis 1. ``"regression"`` and ``"logistic"`` are
    mutually exclusive.

    Returns a ``Container`` holding the inputs ``X``/``y``, the output
    layer, its parameters, and the ``loss``, ``pred``, ``accuracy`` and
    ``prob_vector`` expressions.
    """
    X = T.fmatrix('X')
    y = T.ivector('y')

    assert not ("regression" in args and "logistic" in args)
    regression = "regression" in args

    output_layer = squared_error_net_adaptive() if regression else logistic()
    all_params = lasagne.layers.get_all_params(output_layer)

    if regression:
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = squared_error(prob_vector, y).mean()
        highest_class = args["num_classes"]-1
        pred = T.maximum(0, T.minimum(T.round(prob_vector), highest_class))
    else:
        loss_fn = get_hybrid_loss(args["a"], args["b"])
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = loss_fn(prob_vector, y).mean()
        pred = T.argmax(prob_vector, axis=1)
    accuracy = T.mean(T.eq(pred, y))

    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "pred": pred,
        "accuracy": accuracy,
        "prob_vector": prob_vector
    })
def attend(self, y_p):
    # Coarse-to-fine attention over the levels in self.base, repeated for
    # attrs['glimpse'] glimpses. Levels are visited from the last one down
    # to level 0; each level above 0 yields a position that narrows the
    # window examined on the next level, and level 0 yields the final
    # weights used to average B.
    #
    # Returns (projected prototype, updates).
    #
    # NOTE(review): the source formatting was collapsed; the placement of
    # the statements after the inner loop is reconstructed. Placing them in
    # the level-0 branch instead would be equivalent, since level 0 is the
    # last iteration.
    # NOTE(review): with a single base level the first branch is taken at
    # i == 0 and `idx`/`ext` are never assigned before the B reshape -
    # confirm that at least two levels are always configured.
    # NOTE(review): five values are unpacked from self.get(...) here while
    # a get() visible in this file returns six - confirm which is current.
    updates = self.default_updates()
    for g in range(self.attrs['glimpse']):
        for i in range(len(self.base)-1,-1,-1):
            # Scale factor between this level and the next; 1 at level 0.
            factor = T.constant(self.base[i].attrs['factor'][0], 'int32') if i > 0 else 1
            B, C, I, h_p, _ = self.get(y_p, i, g)
            if i == len(self.base) - 1:
                # First level visited: score every position.
                z_i = self.distance(C, h_p)
            else:
                # Finer level: only look at a window of `ext` frames
                # starting at the position picked on the previous level.
                length = T.cast(T.max(T.sum(I,axis=0))+1,'int32')
                ext = T.cast(T.minimum(ext/factor,T.min(length)),'int32')
                def pick(i_t, ext):
                    # 0/1 vector with `ext` ones starting at `pad`; the
                    # start is shifted back so the window fits inside B.
                    pad = T.minimum(i_t+ext, B.shape[0]) - ext
                    return T.concatenate([T.zeros((pad,), 'int8'), T.ones((ext,), 'int8'), T.zeros((B.shape[0]-pad-ext+1,), 'int8')], axis=0)
                idx, _ = theano.map(pick, sequences = [pos/factor], non_sequences = [ext])
                # Drop the extra trailing row and turn the mask into flat
                # indices into the time*batch-flattened tensors.
                idx = (idx.dimshuffle(1,0)[:-1].flatten() > 0).nonzero()
                C = C.reshape((C.shape[0]*C.shape[1],C.shape[2]))[idx].reshape((ext,C.shape[1],C.shape[2]))
                z_i = self.distance(C, h_p)
                I = I.reshape((I.shape[0]*I.shape[1],))[idx].reshape((ext,I.shape[1]))
            if i > 0:
                # Best position on this level, expressed in units of the
                # next level; the next window spans `factor` frames.
                pos = T.argmax(self.softmax(z_i,I),axis=0) * factor
                ext = factor
            else:
                # Level 0: keep the full weight distribution.
                w_i = self.softmax(z_i,I)
        # Weighted average of the selected rows of B (level 0).
        B = B.reshape((B.shape[0]*B.shape[1],B.shape[2]))[idx].reshape((ext,B.shape[1],B.shape[2]))
        proto = T.sum(B * w_i.dimshuffle(0,1,'x').repeat(B.shape[2],axis=2),axis=0)
        # Every level remembers the glimpse for the next round.
        for i in range(len(self.base)):
            self.glimpses[i].append(proto)
    return T.dot(proto, self.custom_vars['W_att_in_0']), updates
def _build_activation(self, act=None):
    '''Given an activation description, return a callable that implements it.

    Several activations can be chained with ``+``; they are applied from
    left to right (``'a+b'`` computes ``b(a(z))``).

    Parameters
    ----------
    act : str, optional
        Activation description. Defaults to the lower-cased
        ``self.args.activation``. An explicitly passed value is used as
        given (it is not lower-cased).

    Returns
    -------
    callable
        The activation function, tagged with ``__theanets_name__``.

    Raises
    ------
    KeyError
        If the description does not name a known activation.
    '''
    # `reduce` is a builtin only on Python 2; functools provides it on both.
    from functools import reduce

    def compose(a, b):
        c = lambda z: b(a(z))
        c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__, a.__theanets_name__)
        return c

    act = act or self.args.activation.lower()
    if '+' in act:
        return reduce(compose, (self._build_activation(a) for a in act.split('+')))

    options = {
        'tanh': TT.tanh,
        'linear': lambda z: z,
        'logistic': TT.nnet.sigmoid,
        'softplus': TT.nnet.softplus,

        # shorthands
        'relu': lambda z: TT.maximum(0, z),

        # modifiers
        'rect:max': lambda z: TT.minimum(1, z),
        'rect:min': lambda z: TT.maximum(0, z),

        # normalization
        'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
        'norm:max': lambda z: (z.T / TT.maximum(1e-10, abs(z).max(axis=1))).T,
        'norm:std': lambda z: (z.T / TT.maximum(1e-10, TT.std(z, axis=1))).T,
    }

    # Tag every entry with its key so that composed names can be built.
    # `.items()` works on both Python 2 and 3.
    for k, v in options.items():
        v.__theanets_name__ = k

    # Only a missing key means "unknown activation"; anything else should
    # propagate instead of being rewritten by a bare `except`.
    try:
        return options[act]
    except KeyError:
        raise KeyError('unknown --activation %s' % act)
def ready(self):
    """Create the symbolic inputs, the RNN and the compiled ``predict``.

    Only ``self.output_type == 'real'`` is supported; any other value raises
    ``NotImplementedError``.  ``self.activation`` must be one of ``'tanh'``,
    ``'sigmoid'``, ``'relu'`` or ``'cappedrelu'``.
    """
    # input (where first dimension is time)
    self.x = T.matrix()

    # target (where first dimension is time); only real-valued targets exist
    if self.output_type != 'real':
        raise NotImplementedError
    self.y = T.matrix(name='y', dtype=theano.config.floatX)

    # initial hidden state of the RNN
    self.h0 = T.vector()
    # learning rate
    self.lr = T.scalar()

    def rectify(v):
        return v * (v > 0)

    def capped_rectify(v):
        return T.minimum(v * (v > 0), 6)

    chosen = self.activation
    if chosen == 'tanh':
        act_fn = T.tanh
    elif chosen == 'sigmoid':
        act_fn = T.nnet.sigmoid
    elif chosen == 'relu':
        act_fn = rectify
    elif chosen == 'cappedrelu':
        act_fn = capped_rectify
    else:
        raise NotImplementedError

    self.rnn = RNN(
        input=self.x,
        n_in=self.n_in,
        n_hidden=self.n_hidden,
        n_out=self.n_out,
        activation=act_fn,
        output_type=self.output_type,
        use_symbolic_softmax=self.use_symbolic_softmax,
    )

    # `mode` is a module-level name.
    self.predict = theano.function(
        inputs=[self.x],
        outputs=self.rnn.y_pred,
        mode=mode,
    )
def infer_shape(self, node, shapes):
    """Return the output shape: the input shape without ``axis1``/``axis2``,
    followed by the length of the selected diagonal.

    ``shapes`` must hold exactly one input shape.  ``node`` is not used.
    """
    (input_shape,) = shapes
    rows = input_shape[self.axis1]
    cols = input_shape[self.axis2]

    removed = (self.axis1, self.axis2)
    kept = [dim for pos, dim in enumerate(input_shape) if pos not in removed]

    # The following logic is inspired by C code of PyArray_Diagonal().
    shift = self.offset
    if shift > 0:
        diag_len = T.clip(cols - shift, 0, rows)
    elif shift < 0:
        diag_len = T.clip(rows + shift, 0, cols)
    else:
        diag_len = T.minimum(rows, cols)

    return [tuple(kept + [diag_len])]
def learning_updates(self):
    """Yield ``(shared variable, new value)`` pairs for a sign-based update.

    For every parameter, the per-element step is multiplied by
    ``self.step_increase`` where the current and previous gradient agree in
    sign, by ``self.step_decrease`` where they disagree, and left alone where
    their product is zero; the result is kept inside
    ``[self.min_step, self.max_step]``.  Where the sign flipped the gradient
    is zeroed before it is used and stored.
    """
    for param, grad in zip(self.params, self.clipped_gradients()):
        prev_grad = self.shared_like(param, 'grad')
        prev_step = self.shared_like(param, 'step', self.learning_rate.value)

        product = grad * prev_grad
        agrees = TT.gt(product, 0)
        flipped = TT.lt(product, 0)

        multiplier = (TT.eq(product, 0)
                      + agrees * self.step_increase
                      + flipped * self.step_decrease)
        new_step = TT.minimum(
            self.max_step,
            TT.maximum(self.min_step, prev_step * multiplier))

        # Zero the gradient wherever its sign changed.
        kept_grad = grad - flipped * grad

        yield param, param - TT.sgn(kept_grad) * new_step
        yield prev_grad, kept_grad
        yield prev_step, new_step
def compute_steps(self, previous_steps):
    """Rescale ``previous_steps`` against an adaptive threshold on their norm.

    Running averages of the log-norm and of its square are updated; the
    threshold is ``exp(mean + stdevs * std)`` of the log-norm.  Steps whose
    joint norm is not below the threshold are scaled to the clip level
    (``exp(mean)`` when ``self.clip_to_mean`` is set, else the threshold).

    Returns
    -------
    steps : OrderedDict
        Parameter -> rescaled step.
    updates : list of tuples
        ``(shared variable, new value)`` pairs for the running statistics.
    """
    next_adapt_steps = self.adapt_steps + 1.0

    # min(decay, t / (t + 1)): the mean estimate converges quickly early on
    rho_mean = tensor.minimum(self.decay,
                              self.adapt_steps / next_adapt_steps)
    # the second moment may use the same fast schedule or the plain decay
    rho_sq = rho_mean if self.quick_variance_convergence else self.decay

    norm = l2_norm(previous_steps.values())
    log_norm = tensor.log(l2_norm(previous_steps.values()))

    new_log_mean = (rho_mean * self.gnorm_log_ave
                    + (1. - rho_mean) * log_norm)
    # this one starts from 0, so slower convergence is acceptable
    new_log_sq_mean = (rho_sq * self.gnorm_log2_ave
                       + (1. - rho_sq) * (log_norm**2))

    spread = tensor.sqrt(
        tensor.maximum(0.0, new_log_sq_mean - new_log_mean**2))
    new_threshold = tensor.exp(new_log_mean + spread * self.stdevs)
    new_level = (tensor.exp(new_log_mean) if self.clip_to_mean
                 else new_threshold)

    scale = tensor.switch(norm < new_threshold, 1, new_level / norm)

    steps = OrderedDict()
    for parameter, step in previous_steps.items():
        steps[parameter] = step * scale

    state_updates = [
        (self.adapt_steps, next_adapt_steps),
        (self.gnorm_log_ave, new_log_mean),
        (self.gnorm_log2_ave, new_log_sq_mean),
        (self.clip_threshold, new_threshold),
        (self.clip_level, new_level),
    ]
    return steps, state_updates
def queue_transform(feature_strengths, feature_vects, return_strengths=False):
    """
    Process features according to a "fragmented queue", where each timestep
    gets a size-1 window onto a feature queue.

    ``feature_strengths`` says how much is pushed at each step and
    ``feature_vects`` what is pushed; the amount popped is tied to the same
    strengths, and the output is a size-1 peek (nothing is removed).

    Parameters:
        - feature_strengths: float32 tensor of shape (batch, push_timestep)
            in [0,1]
        - feature_vects: float32 tensor of shape
            (batch, push_timestep, feature_dim)
        - return_strengths: when true, also return the peek strengths

    Returns:
        - peek_vects: float32 tensor of shape (batch, timestep, feature_dim);
            or the tuple ``(peek_strengths, peek_vects)`` when
            ``return_strengths`` is true, with peek_strengths of shape
            (batch, timestep, push_timestep)
    """
    batch_size, num_steps, _ = feature_vects.shape
    running_total = T.extra_ops.cumsum(feature_strengths, 1)

    # Work in (batch, timestep, push_timestep): subtracting what was pushed
    # before a timestep and capping to [0, 1] leaves the cumulative sums of
    # only the features active at that timestep.
    pushed_before = T.shape_padright(running_total - feature_strengths)
    totals_by_push = T.shape_padaxis(running_total, 1)
    window = T.minimum(T.maximum(totals_by_push - pushed_before, 0), 1)

    # A first difference along the push axis recovers the peek strengths.
    leading_zeros = T.zeros((batch_size, num_steps, 1))
    previous = T.concatenate([leading_zeros, window[:, :, :-1]], 2)
    peek_strengths = window - previous

    peek_vects = T.batched_dot(peek_strengths, feature_vects)
    if return_strengths:
        return peek_strengths, peek_vects
    return peek_vects
def parse_transfer_function(string_identifier, slope_parameter=None):
    """
    Return the activation function selected by ``string_identifier``.

    string_identifier:
        possible values are tanh, ReLU/relu, sigmoid/sig, abs/Abs/absolute,
        plu/PLu/PLU/piecewise, maxout <number>, linear/lin

    slope_parameter:
        required (must not be None) for the piece-wise linear unit; it
        multiplies the negative part of the input.

    RETURNS:
        transfer_function (python/theano function),
        string_identifier (normalized where an alias was given),
        dict with key "cross_channel_pooling_groups" (the maxout group
        count, otherwise None)

    For "maxout <number>" the returned transfer function is the identity:
    the non-linearity is the cross-channel pooling described by the dict.

    Raises NotImplementedError for an unknown identifier.
    """
    cross_channel_pooling_groups = None

    if string_identifier == 'tanh':
        Activation_f = T.tanh
    elif string_identifier in ['ReLU', 'relu']:  # rectified linear unit
        string_identifier = "relu"
        Activation_f = lambda x: x * (x > 0)
    elif string_identifier in ['sigmoid', 'sig']:
        string_identifier = "sigmoid"
        Activation_f = T.nnet.sigmoid
    elif string_identifier in ['abs', 'Abs', 'absolute']:
        string_identifier = 'abs'
        Activation_f = T.abs_
    elif string_identifier in ['plu', 'PLu', 'PLU', 'piecewise']:  # piece-wise linear function
        string_identifier = "PLU"
        print(
            "parse_transfer_function::Remember to optimize the 'slope_parameter'"
        )
        assert slope_parameter is not None, "...and better pass it to this function, as well! (type: Theano.Tensor, shape: same as activation, unif. random values [-1,1] should be fine)"
        Activation_f = lambda x: T.maximum(0, x) + T.minimum(
            0, x) * slope_parameter
    elif "maxout" in string_identifier:
        r = int(string_identifier.split(" ")[1])
        assert r >= 2
        cross_channel_pooling_groups = r
        # This branch used to leave Activation_f unbound, so the return below
        # raised UnboundLocalError for every maxout identifier.
        Activation_f = lambda x: x
    elif string_identifier in ['linear', "lin"]:
        string_identifier = "linear"
        Activation_f = lambda x: x
    else:
        raise NotImplementedError()

    return Activation_f, string_identifier, {
        "cross_channel_pooling_groups": cross_channel_pooling_groups
    }
def init_fbcorr(self, x, x_shp, n_filters, filter_shape,
                min_out=fbcorr_.DEFAULT_MIN_OUT,
                max_out=fbcorr_.DEFAULT_MAX_OUT,
                stride=fbcorr_.DEFAULT_STRIDE,
                mode=fbcorr_.DEFAULT_MODE,
                generate=None):
    """Correlate ``x`` with a generated filterbank and clamp the result.

    Returns ``(output, output_shape)``.  The output is limited from below by
    ``min_out`` and from above by ``max_out``; either bound may be ``None``
    to disable it.

    Raises ``NotImplementedError`` for a non-default ``stride`` or for a
    ``mode`` other than ``'valid'`` / ``'full'``.
    """
    # Reference implementation:
    # ../pythor3/pythor3/operation/fbcorr_/plugins/scipy_naive/scipy_naive.py
    if stride != fbcorr_.DEFAULT_STRIDE:
        raise NotImplementedError('stride is not used in reference impl.')

    # The filterbank generator only needs an array of the right shape/dtype.
    fake_x = np.empty((x_shp[2], x_shp[3], x_shp[1]), x.dtype)
    bank_spec = dict(n_filters=n_filters,
                     filter_shape=filter_shape,
                     generate=generate)
    kerns = self.SLMP._get_filterbank(fake_x, bank_spec)
    # Move the last axis to position 1 and flip both spatial axes.
    kerns = kerns.transpose(0, 3, 1, 2).copy()[:, :, ::-1, ::-1]

    x = conv.conv2d(
        x,
        kerns,
        image_shape=x_shp,
        filter_shape=kerns.shape,
        border_mode=mode)

    f_rows, f_cols = filter_shape[0], filter_shape[1]
    if mode == 'valid':
        out_rows = x_shp[2] - f_rows + 1
        out_cols = x_shp[3] - f_cols + 1
    elif mode == 'full':
        out_rows = x_shp[2] + f_rows - 1
        out_cols = x_shp[3] + f_cols - 1
    else:
        raise NotImplementedError('fbcorr mode', mode)
    x_shp = (x_shp[0], n_filters, out_rows, out_cols)

    if min_out is not None and max_out is not None:
        return tensor.clip(x, min_out, max_out), x_shp
    if max_out is not None:
        return tensor.minimum(x, max_out), x_shp
    if min_out is not None:
        return tensor.maximum(x, min_out), x_shp
    return x, x_shp
def post_modify_updates(self, updates, model):
    """Cap the column norms of the updated weight matrix at ``self.limit``.

    The weight matrix is ``model.W`` when present, otherwise the single
    parameter of ``model.transformer``.  ``updates`` is modified in place and
    only when it already holds an entry for that matrix.

    Raises ``TypeError`` when the model has neither attribute or when the
    transformer does not expose exactly one parameter.
    """
    if hasattr(model, 'W'):
        W = model.W
    elif hasattr(model, 'transformer'):
        params = model.transformer.get_params()
        if len(params) != 1:
            raise TypeError("self.transformer does not have exactly one "
                            "parameter tensor.")
        W, = params
    else:
        raise TypeError("model has neither 'W' nor 'transformer'.")

    if W not in updates:
        return

    new_W = updates[W]
    norms = T.sqrt(T.square(new_W).sum(axis=0))
    capped = T.minimum(norms, self.limit)
    # The floor in the denominator avoids dividing by a zero norm.
    updates[W] = new_W * (capped / T.maximum(1e-7, norms))
def __init__(self, inputs, input_size, output_size, is_backward=False):
    """Build a recurrent layer over ``inputs`` with a rectifier capped at 20.

    An outer scan iterates over the first axis of ``inputs``; for each
    element an inner scan runs the recurrence over ``inputs.shape[1]`` steps
    (in reverse when ``is_backward`` is true).  The result of the outer scan
    is stored in ``self.output``.

    ``self.params`` lists the input weights, recurrent weights and bias; the
    initial state is created here but is not part of ``self.params``.
    """
    W_if = U.create_shared(U.initial_weights(input_size, output_size))
    W_ff = U.create_shared(U.initial_weights(output_size, output_size))
    b = U.create_shared(U.initial_weights(output_size))
    initial = U.create_shared(U.initial_weights(output_size))

    self.activation_fn = lambda x: T.minimum(x * (x > 0), 20)

    def run_one(in_t):
        # Recurrence over the second axis of `inputs` for one outer element.
        def advance(index, out_tminus1):
            pre_activation = (T.dot(out_tminus1, W_ff)
                              + T.dot(in_t[index], W_if)
                              + b)
            return self.activation_fn(pre_activation)

        return theano.scan(
            advance,
            sequences=[T.arange(inputs.shape[1])],
            outputs_info=[initial],
            go_backwards=is_backward)

    self.output, _ = theano.scan(
        run_one,
        sequences=[inputs]  # for each sample at time "t"
    )

    self.params = [W_if, W_ff, b]
def get_activation(act=None):
    """Return the callable for the activation named ``act``.

    ``act`` is a key of the table below, or several keys joined with ``'+'``
    (applied left to right).  Every returned callable carries a
    ``__theanets_name__`` attribute.

    Raises ``KeyError`` for a name that is not in the table.
    """
    def chain(first, second):
        # Apply `first`, then `second`; the name reads outermost-first.
        def chained(z):
            return second(first(z))
        chained.__theanets_name__ = '%s(%s)' % (
            second.__theanets_name__, first.__theanets_name__)
        return chained

    if '+' in act:
        parts = (get_activation(piece) for piece in act.split('+'))
        return functools.reduce(chain, parts)

    options = {
        'tanh': T.tanh,
        'linear': lambda z: z,
        'logistic': T.nnet.sigmoid,
        'sigmoid': T.nnet.sigmoid,
        'hard_sigmoid': T.nnet.hard_sigmoid,
        'softplus': T.nnet.softplus,
        'softmax': softmax,
        'theano_softmax': T.nnet.softmax,

        # shorthands
        'relu': lambda z: T.nnet.relu(z),
        'leaky_relu': lambda z: T.nnet.relu(z, 0.01),
        'trel': lambda z: z * (z > 0) * (z < 1),
        'trec': lambda z: z * (z > 1),
        'tlin': lambda z: z * (abs(z) > 1),

        # modifiers
        'rect:max': lambda z: T.minimum(1, z),
        'rect:min': lambda z: T.maximum(0, z),

        # normalization
        'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
        'norm:max': lambda z: (z.T / T.maximum(1e-10, abs(z).max(axis=1))).T,
        'norm:std': lambda z: (z.T / T.maximum(1e-10, T.std(z, axis=1))).T,
    }
    for label, fn in options.items():
        fn.__theanets_name__ = label

    if act not in options:
        raise KeyError('unknown activation %r' % act)
    return options[act]
def sgd(loss, params, learning_rate, clip_at=5.0, scale_norm=0.0):
    """Build gradient-descent updates with per-parameter norm clipping.

    Each gradient is multiplied by ``min(clip_at, ||g||) / (||g|| + 1e-8)``
    (L2 norm) before the step is taken.  ``scale_norm`` is accepted but is
    not used by this implementation.

    Returns ``(updates, grads)``: an ``OrderedDict`` mapping every parameter
    to its new value, and the unmodified gradients from ``T.grad``.
    """
    epsilon = 1e-8
    grads = T.grad(cost=loss, wrt=params)

    updates = OrderedDict()
    for param, raw_grad in zip(params, grads):
        norm = raw_grad.norm(L=2)
        shrink = T.minimum(clip_at, norm) / (norm + epsilon)
        updates[param] = param - learning_rate * (shrink * raw_grad)

    return updates, grads
def __init__(self, n_in=5, hidden_stride=[50], n_out=5, learning_rate=0.01,
             L1_reg=0.00, L2_reg=0.00, learning_rate_decay=1,
             activation='tanh', final_momentum=0.9, initial_momentum=0.5,
             momentum_switchover=5):
    """Store the hyper-parameters and build the underlying ``RNN``.

    ``activation`` must be ``'tanh'``, ``'sigmoid'``, ``'relu'`` or
    ``'cappedrelu'``; any other value raises ``NotImplementedError``.
    ``hidden_stride`` is passed to ``RNN`` as given and its length is kept
    in ``self.stride_cnt``.  Note that its default is a single list object
    shared by every call that omits the argument.
    """
    self.n_in = int(n_in)
    self.n_out = int(n_out)
    self.learning_rate = float(learning_rate)
    self.learning_rate_decay = float(learning_rate_decay)
    self.activation = activation
    self.initial_momentum = float(initial_momentum)
    self.final_momentum = float(final_momentum)
    self.momentum_switchover = int(momentum_switchover)

    def rectify(v):
        return v * (v > 0)

    def capped_rectify(v):
        return T.minimum(v * (v > 0), 6)

    chosen = self.activation
    if chosen == 'tanh':
        act_fn = T.tanh
    elif chosen == 'sigmoid':
        act_fn = T.nnet.sigmoid
    elif chosen == 'relu':
        act_fn = rectify
    elif chosen == 'cappedrelu':
        act_fn = capped_rectify
    else:
        raise NotImplementedError

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    logger.info('... building the model')
    self.rnn = RNN(n_in, hidden_stride, n_out,
                   activation=act_fn,
                   L1_reg=L1_reg,
                   L2_reg=L2_reg)
    self.stride_cnt = len(hidden_stride)
def sample_session_batch(self, max_n_samples, replace=False, selector_dtype='int32'):
    """Return a session batch drawn at random from this pool.

    Session ids are drawn with ``self.rng.choice`` from ``self.pool_size``
    candidates and handed to ``self.select_session_batch``.

    :param max_n_samples: number of sessions to draw.  Without replacement
        the count is ``min(max_n_samples, self.pool_size)``; with replacement
        it is exactly ``max_n_samples``.
    :param replace: whether ids are drawn with replacement.
    :param selector_dtype: dtype requested for the drawn ids.
    :returns: whatever ``self.select_session_batch`` returns for the ids.
    """
    if replace:
        count = max_n_samples
    else:
        # Cannot draw more distinct sessions than the pool holds.
        count = T.minimum(max_n_samples, self.pool_size)

    chosen_ids = self.rng.choice(
        size=(count,),
        a=self.pool_size,
        dtype=selector_dtype,
        replace=replace)
    return self.select_session_batch(chosen_ids)
def init_opt(self):
    """Build the clipped likelihood-ratio surrogate loss and register it.

    The per-sample objective is the smaller of ``lr * advantage`` and
    ``clip(lr, 1 - epsilon, 1 + epsilon) * advantage``; the loss given to
    ``self.optimizer.update_opt`` is the negated mean.  Returns an empty dict.
    """
    float_x = theano.config.floatX

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )
    advantage_var = ext.new_tensor('advantage', ndim=1, dtype=float_x)
    mean_var = ext.new_tensor('mean', ndim=2, dtype=float_x)
    log_std_var = ext.new_tensor('log_std', ndim=2, dtype=float_x)

    old_info = dict(mean=mean_var, log_std=log_std_var)
    new_info = self.policy.dist_info_sym(obs_var)
    ratio = self.policy.distribution.likelihood_ratio_sym(
        action_var, old_info, new_info)

    plain = ratio * advantage_var
    clipped = TT.clip(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage_var
    surr_loss = -TT.mean(TT.minimum(plain, clipped))

    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        inputs=[obs_var, action_var, advantage_var, mean_var, log_std_var],
    )
    return dict()
def learning_rate_updates(self, gradients):
    """
    Compute the shared-variable updates tied to the learning rate, together
    with the per-parameter learning rates.

    Returns
    -------
    ups : dict
        Shared variables representing SGD metadata as keys and symbolic
        expressions of their new values as values.
    learn_rates : list
        One learning rate per parameter, in the order of ``self.params``.
    """
    ups = {}

    if self.use_adagrad:
        learn_rates = []
        for param, gp in zip(self.params, gradients):
            acc = self.accumulators[param]
            # One scalar accumulator per parameter: summed squared gradient.
            new_acc = acc + (gp**2).sum()
            ups[acc] = new_acc
            learn_rates.append(self.e0s[param] / (new_acc**.5))
        return ups, learn_rates

    # Annealing coefficient:
    # min(base_lr, anneal_start / (iteration + 1))
    if self.anneal_start is None:
        annealed = sharedX(self.base_lr)
    else:
        frac = self.anneal_start / (self.iteration + 1.)
        annealed = tensor.minimum(
            as_floatX(frac),
            self.base_lr  # maximum learning rate
        )

    # Update the shared variable for the annealed learning rate.
    ups[self.annealed] = annealed
    ups[self.iteration] = self.iteration + 1

    # Learning rates in the order the parameters appear in self.params
    learn_rates = [annealed * self.learning_rates[p] for p in self.params]
    return ups, learn_rates
def _new_update_deltas(self, network, parameter_vws, grads):
    """Return the update deltas for every parameter and its three state
    variables (memory, running gradient, running squared gradient).

    Hyperparameters ``learning_rate`` (default 0.001) and ``epsilon``
    (default 1e-16) are looked up on ``network``.
    """
    learning_rate = network.find_hyperparameter(["learning_rate"], 0.001)
    epsilon = network.find_hyperparameter(["epsilon"], 1e-16)

    def state_like(name_template, parameter_vw, inits):
        # Shared state variable with the same shape as the parameter.
        return network.create_vw(
            name_template % parameter_vw.name,
            shape=parameter_vw.shape,
            is_shared=True,
            tags={"state"},
            default_inits=inits,
        )

    update_deltas = treeano.UpdateDeltas()
    for parameter_vw, grad in zip(parameter_vws, grads):
        mem_vw = state_like("smorms3_mem(%s)", parameter_vw,
                            [treeano.inits.ConstantInit(1)])
        g_vw = state_like("smorms3_g(%s)", parameter_vw, [])
        g2_vw = state_like("smorms3_g2(%s)", parameter_vw, [])

        parameter = parameter_vw.variable
        mem = mem_vw.variable
        g = g_vw.variable
        g2 = g2_vw.variable

        rate = 1 / (mem + 1)
        keep = 1 - rate
        next_g = keep * g + rate * grad
        next_g2 = keep * g2 + rate * grad**2

        ratio = (next_g**2) / (next_g2 + epsilon)
        denom = T.sqrt(next_g2) + epsilon
        next_mem = 1 + mem * (1 - ratio)

        update_deltas[parameter] = -grad * T.minimum(learning_rate, ratio) / denom
        update_deltas[mem] = next_mem - mem
        update_deltas[g] = next_g - g
        update_deltas[g2] = next_g2 - g2
    return update_deltas
def ready(self):
    """Create the symbolic inputs, the CNN and the compiled predictors.

    Called from "fit", since the image size (assumed square) and the output
    labels are determined from the training data.  ``self.activation`` must
    be ``'tanh'``, ``'sigmoid'``, ``'relu'`` or ``'cappedrelu'``; anything
    else raises ``NotImplementedError``.
    """
    # input
    self.x = T.matrix('x')
    # output (a label)
    self.y = T.ivector('y')

    def rectify(v):
        return v * (v > 0)

    def capped_rectify(v):
        return T.minimum(v * (v > 0), 6)

    chosen = self.activation
    if chosen == 'tanh':
        act_fn = T.tanh
    elif chosen == 'sigmoid':
        act_fn = T.nnet.sigmoid
    elif chosen == 'relu':
        act_fn = rectify
    elif chosen == 'cappedrelu':
        act_fn = capped_rectify
    else:
        raise NotImplementedError

    self.cnn = CNN(
        input=self.x,
        n_in=self.n_in,
        n_out=self.n_out,
        activation=act_fn,
        nkerns=self.nkerns,
        filters=self.filters,
        n_hidden=self.n_hidden,
        poolsize=self.poolsize,
        output_type=self.output_type,
        batch_size=self.batch_size,
        use_symbolic_softmax=self.use_symbolic_softmax,
    )

    # self.cnn.predict expects batch_size number of inputs.
    # These functions are wrapped and padded as necessary in 'def predict'
    # and 'def predict_proba'.  `mode` is a module-level name.
    self.predict_wrap = theano.function(
        inputs=[self.x],
        outputs=self.cnn.y_pred,
        mode=mode)
    self.predict_proba_wrap = theano.function(
        inputs=[self.x],
        outputs=self.cnn.p_y_given_x,
        mode=mode)
def minimum(var1, var2, name=None):
    """Compute elementwise min among tensors

    Parameters
    ----------
    var1, var2: Tensor
        Tensors to compare. ``unwrap()`` is called on both of them.

    name : str
        Name of new Tensor

    Returns
    -------
    Tensor
        The resulting Tensor; its shape is taken from ``var1``.
    """
    # TODO: Add Broadcasting
    lhs = var1.unwrap()
    rhs = var2.unwrap()
    return Tensor(tensor=T.minimum(lhs, rhs), shape=var1.shape, name=name)
def optimizer(self):
    """Return the ``Maximizer`` for this object, building it on first use.

    The objective is computed from the projections of ``self.samples`` onto
    a normalised difference of the 'A' and 'B' feature vectors.  Without an
    ``avg_case`` attribute it is the smaller of the two one-sided sums; with
    it, the sums are weighted by the counts of negative and positive
    projections.  The result is cached in ``self._optimizer``.
    """
    if hasattr(self, '_optimizer'):
        return self._optimizer

    df = self.fvector('A') - self.fvector('B')
    # Leaves df unchanged while its 2-norm is at most 1, shrinks it otherwise.
    phi = df / (1 + tn.relu(df.norm(2) - 1))
    y = tt.dot(self.samples, phi)

    n_negative = tt.sum(tt.switch(y < 0, 1., 0.))
    n_positive = tt.sum(tt.switch(y > 0, 1., 0.))
    positive_term = tt.sum(1. - tt.exp(-tn.relu(y)))
    negative_term = tt.sum(1. - tt.exp(-tn.relu(-y)))

    if hasattr(self, 'avg_case'):
        obj = n_negative * positive_term + n_positive * negative_term
    else:
        obj = tt.minimum(positive_term, negative_term)

    variables = [self.x0]
    for robot in self.robots:
        variables += [robot.x[0]] + robot.u
    for human in self.human.values():
        variables += human.u

    self._optimizer = Maximizer(obj, variables)
    return self._optimizer
def cat_entropy(arr):
    """Return the entropy of categorical distributions described by the rows
    in ``arr``.

    Parameters
    ----------

    arr : Theano variable
        Array of shape ``(n, d)`` describing ``n`` different categorical
        variables. Rows need to sum up to ``1`` and be non-negative.

    Returns
    -------

    res : theano variable
        Has shape ``(n,)``.
    """
    # TODO check if this is also valid for multinomial.
    # Shift away from zero so the log stays finite; cap at 1 afterwards.
    safe = T.minimum(1, arr + 1e-8)
    weighted_logs = safe * T.log(safe)
    return -weighted_logs.sum(axis=1)
def relevance_pool(out_relevances, inputs, pool_size, pool_stride):
    """Redistribute ``out_relevances`` of a pooling layer onto its inputs.

    Each output relevance is divided by the sum of the (offset-shifted)
    inputs inside its pooling window, spread back over that window, and
    multiplied by the shifted inputs.  Windows whose sum is exactly zero use
    a divisor of 1 to avoid NaNs, so relevance is not fully conserved there.
    """
    n_chans = out_relevances.shape[1]

    # Make inputs non-negative by subtracting the per-sample minimum
    # (only when that minimum is below zero).
    shift = T.minimum(0, T.min(inputs, axis=(1, 2, 3), keepdims=True))
    inputs = inputs - shift

    # Kernel of ones, channels x channels x pool_0 x pool_1, with every
    # cross-channel entry zeroed so values only spread within a channel.
    kernel_shape = [n_chans, n_chans, pool_size[0], pool_size[1]]
    same_channel = T.eye(n_chans, n_chans).dimshuffle(0, 1, 'x', 'x')
    pool_ones = T.ones(kernel_shape, dtype=np.float32) * same_channel

    window_sums = conv2d(inputs, pool_ones,
                         subsample=pool_stride,
                         border_mode='valid')
    # Prevent division by 0: relevance with a zero window sum is not
    # redistributed anyway, the divisor only has to avoid NaNs.
    window_sums = window_sums + T.eq(window_sums, 0) * 1
    scaled = out_relevances / window_sums

    # The stride has to be taken into account, see
    # http://stackoverflow.com/a/28752057/1469195
    stride_0, stride_1 = pool_stride[0], pool_stride[1]
    spread_shape = (
        scaled.shape[0],
        scaled.shape[1],
        scaled.shape[2] * stride_0 - stride_0 + 1,
        scaled.shape[3] * stride_1 - stride_1 + 1,
    )
    spread = T.zeros(spread_shape, dtype=np.float32)
    spread = T.set_subtensor(spread[:, :, ::stride_0, ::stride_1], scaled)

    in_relevances = conv2d(spread, pool_ones,
                           subsample=(1, 1),
                           border_mode='full')
    return in_relevances * inputs
def relevance_conv(out_relevances, inputs, weights, rule, bias=None,
                   min_in=None, max_in=None, a=None, b=None):
    """Dispatch to the relevance routine selected by ``rule``.

    ``rule`` must be one of 'w_sqr', 'z', 'z_plus', 'z_b', 'adapt_z_b',
    'sign_stable', 'a_b', 'a_b_sign_switch', 'a_b_abs', 'a_b_in_plus'.
    'z_b' additionally requires ``min_in <= 0 <= max_in``; 'adapt_z_b'
    derives both bounds from ``inputs``.  The ``a``/``b`` arguments are only
    forwarded by the 'a_b*' rules.
    """
    assert rule in ['w_sqr', 'z', 'z_plus', 'z_b', 'adapt_z_b',
                    'sign_stable', 'a_b', 'a_b_sign_switch', 'a_b_abs',
                    'a_b_in_plus']

    if rule == 'w_sqr':
        return relevance_conv_w_sqr(out_relevances, weights, bias=bias)
    if rule == 'z':
        return relevance_conv_z(out_relevances, inputs, weights, bias=bias)
    if rule == 'z_plus':
        return relevance_conv_z_plus(out_relevances, inputs, weights,
                                     bias=bias)
    if rule == 'z_b':
        assert min_in is not None
        assert max_in is not None
        assert min_in <= 0
        assert max_in >= 0
        return relevance_conv_z_b(out_relevances, inputs, weights,
                                  min_in, max_in, bias=bias)
    if rule == 'adapt_z_b':
        # Bounds come from the data, clipped so that lower <= 0 <= upper.
        lower = T.minimum(0, T.min(inputs))
        upper = T.maximum(0, T.max(inputs))
        return relevance_conv_z_b(out_relevances, inputs, weights,
                                  lower, upper, bias=bias)
    if rule == 'sign_stable':
        return relevance_conv_stable_sign(inputs, weights, out_relevances,
                                          bias=bias)
    if rule == 'a_b':
        return relevance_conv_a_b(inputs, weights, out_relevances,
                                  a=a, b=b, bias=bias)
    if rule == 'a_b_sign_switch':
        return relevance_conv_a_b_sign_switch(inputs, weights,
                                              out_relevances,
                                              a=a, b=b, bias=bias)
    if rule == 'a_b_abs':
        return relevance_conv_a_b_abs(inputs, weights, out_relevances,
                                      a=a, b=b, bias=bias)
    if rule == 'a_b_in_plus':
        return relevance_conv_a_b_in_plus(inputs, weights, out_relevances,
                                          a, b, bias)
def __init__(self, rng, input, batch_size, in_size, label_size, latent_size,
             label_fn, W_y=None, b_y=None, W_a=None, W_b=None):
    """Initialise the parent encoder and add a label-prediction head.

    ``W_y`` / ``b_y`` are created here when not supplied: the weights are
    drawn from ``rng.standard_normal`` scaled by 0.01 with shape
    ``(in_size, label_size)``, the bias is zeros of length ``label_size``.
    ``self.y_probs`` is ``label_fn`` of the affine map of ``self.input``,
    clamped to ``[1e-4, 1 - 1e-4]``.  Both new parameters are appended to
    ``self.params``.
    """
    self.label_fn = label_fn

    # init parent class
    super(StickBreaking_Encoder_w_Labels, self).__init__(
        rng=rng,
        input=input,
        batch_size=batch_size,
        in_size=in_size,
        latent_size=latent_size,
        W_a=W_a,
        W_b=W_b)

    # setup label prediction params
    if W_y is None:
        initial_W = 0.01 * rng.standard_normal(size=(in_size, label_size))
        W_y = theano.shared(
            value=np.asarray(initial_W, dtype=theano.config.floatX),
            name='W_y')
    if b_y is None:
        initial_b = np.zeros((label_size, ), dtype=theano.config.floatX)
        b_y = theano.shared(value=initial_b, name='b_y')
    self.W_y = W_y
    self.b_y = b_y

    # compute the label probabilities
    raw_probs = self.label_fn(T.dot(self.input, self.W_y) + self.b_y)
    self.y_probs = raw_probs
    # Force 0 < output < 1
    self.y_probs = T.maximum(T.minimum(self.y_probs, 1 - 1e-4), 1e-4)

    self.params += [self.W_y, self.b_y]
def build(self):
    """Assemble and return the PyMC model.

    Trafficking parameters come from ``commonTraf()``; two reverse rates get
    log-normal priors.  Everything is concatenated into one parameter vector
    that is passed to ``self.act.calc`` and, when ``self.pretreat`` is
    ``True``, to ``self.cross.calc``.  The model log-probability is stored
    as the deterministic ``"logp"``.
    """
    model = pm.Model()

    with model:
        kfwd, endo, activeEndo, kRec, kDeg, sortF = commonTraf()

        # Constant fillers: six ones (associated with IL2 and IL15) and a
        # single one used between the sampled rates.
        nullRates = T.ones(6, dtype=np.float64)
        Tone = T.ones(1, dtype=np.float64)

        # associated with IL7
        k27rev = pm.Lognormal("k27rev", mu=np.log(0.1), sigma=1, shape=1)
        # associated with IL4
        k33rev = pm.Lognormal("k33rev", mu=np.log(0.1), sigma=1, shape=1)

        # Receptor expression derived from fixed per-cell counts
        # (presumably measured values -- verify the slot order against the
        # model the vector is fed to).
        measured = np.array([0.0, 0.0, 328.0, 0.0, 2591.0, 0.0, 254.0, 0.0])
        Rexpr = (measured * endo) / (1.0 + ((kRec * (1.0 - sortF)) /
                                            (kDeg * sortF)))

        # indexing same as in model.hpp
        unkVec = T.concatenate(
            (kfwd, nullRates, k27rev, Tone, k33rev, Tone, endo, activeEndo,
             sortF, kRec, kDeg, Rexpr))

        # fit the data based on act.calc for the given parameters
        self.act.calc(unkVec, model)

        if self.pretreat is True:
            # fit the data based on cross.calc
            Y_cross = self.cross.calc(unkVec)
            pm.Deterministic("Y_cross", T.sum(T.square(Y_cross)))
            # Spread of the residuals, capped at 0.1.
            sd_cross = T.minimum(T.std(Y_cross), 0.1)
            pm.Normal("fitD_cross", sigma=sd_cross, observed=Y_cross)

        # Save likelihood
        pm.Deterministic("logp", model.logpt)

    return model
def _step(self, xg_t, xo_t, xc_t, mask_tm1, h_tm1, c_tm1, u_g, u_o, u_c):
    """One recurrent step: the new cell state is a softmax-weighted mix of
    nine candidate combinations of the masked previous cell state and the
    new candidate value.

    Returns ``(h_t, c_t)``.
    """
    prev_h = mask_tm1 * h_tm1
    prev_c = mask_tm1 * c_tm1

    # Gate scores, normalised with a softmax over the last axis.
    scores = T.tensordot(xg_t + prev_h, u_g, [[1], [2]])
    flat_scores = scores.reshape((-1, scores.shape[-1]))
    gate = T.nnet.softmax(flat_scores).reshape(scores.shape)

    candidate = self.activation(xc_t + T.dot(prev_h, u_c))

    # The combinations the gate chooses between (order matters: it has to
    # line up with the last axis of `gate`).
    ops = [
        prev_c,
        candidate,
        (prev_c + candidate),
        T.maximum(prev_c, candidate),
        T.minimum(prev_c, candidate),
        prev_c - candidate,
        prev_c * candidate,
        0 * candidate,
        0 * candidate + 1,
    ]
    stacked = T.as_tensor_variable(ops, name='yshuff').dimshuffle(1, 2, 0)

    flat_gate = gate.reshape((-1, gate.shape[-1]))
    flat_ops = stacked.reshape((-1, stacked.shape[-1]))
    c_t = (flat_gate * flat_ops).sum(axis=1).reshape(gate.shape[:2])

    o_t = self.inner_activation(xo_t + T.dot(prev_h, u_o))
    h_t = o_t * self.activation(c_t)
    return h_t, c_t
def ready(self):
    """Create the symbolic inputs, the RNN and the compiled ``predict``.

    Only ``self.output_type == 'real'`` is supported; any other value raises
    ``NotImplementedError``.  ``self.activation`` must be one of ``'tanh'``,
    ``'sigmoid'``, ``'relu'`` or ``'cappedrelu'``.
    """
    # input (where first dimension is time)
    self.x = T.matrix()

    # target (where first dimension is time); only real-valued targets exist
    if self.output_type != 'real':
        raise NotImplementedError
    self.y = T.matrix(name='y', dtype=theano.config.floatX)

    # initial hidden state of the RNN
    self.h0 = T.vector()
    # learning rate
    self.lr = T.scalar()

    def rectify(v):
        return v * (v > 0)

    def capped_rectify(v):
        return T.minimum(v * (v > 0), 6)

    chosen = self.activation
    if chosen == 'tanh':
        act_fn = T.tanh
    elif chosen == 'sigmoid':
        act_fn = T.nnet.sigmoid
    elif chosen == 'relu':
        act_fn = rectify
    elif chosen == 'cappedrelu':
        act_fn = capped_rectify
    else:
        raise NotImplementedError

    self.rnn = RNN(
        input=self.x,
        n_in=self.n_in,
        n_hidden=self.n_hidden,
        n_out=self.n_out,
        activation=act_fn,
        output_type=self.output_type,
        use_symbolic_softmax=self.use_symbolic_softmax,
    )

    # `mode` is a module-level name.
    self.predict = theano.function(
        inputs=[self.x],
        outputs=self.rnn.y_pred,
        mode=mode,
    )
def ready(self):
    """Create the symbolic inputs, the estimator and the compiled ``predict``.

    ``self.activation`` must be ``'tanh'``, ``'sigmoid'``, ``'relu'`` or
    ``'cappedrelu'``; anything else raises ``NotImplementedError``.  Also
    records a creation timestamp and starts an empty error log.
    """
    # input (where first dimension is time)
    self.x = T.tensor3(name='x')
    # target (where first dimension is time)
    self.y = T.tensor3(name='y', dtype=theano.config.floatX)
    # learning rate
    self.lr = T.scalar()

    def rectify(v):
        return v * (v > 0)

    def capped_rectify(v):
        return T.minimum(v * (v > 0), 6)

    chosen = self.activation
    if chosen == 'tanh':
        act_fn = T.tanh
    elif chosen == 'sigmoid':
        act_fn = T.nnet.sigmoid
    elif chosen == 'relu':
        act_fn = rectify
    elif chosen == 'cappedrelu':
        act_fn = capped_rectify
    else:
        raise NotImplementedError

    # numpy random generator seeded from the configured seed
    numpy_rng = np.random.RandomState(self.numpy_rng_seed)

    self.estimator = RNN(
        input=self.x,
        n_in=self.n_in,
        n_hidden=self.n_hidden,
        n_out=self.n_out,
        truncated_num=self.truncated_num,
        activation=act_fn,
        numpy_rng=numpy_rng,
    )

    # `mode` is a module-level name.
    self.predict = theano.function(
        inputs=[self.x],
        outputs=self.estimator.y_pred,
        mode=mode)

    # time stamp in the form YYYYMMDD-HHMMSS
    self.timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

    # initialize errorlog
    self.errorlog = []
def input_row_from_variables(ori_ip, dest_ip, ori_lat, ori_long, dest_lat,
                             dest_long, ori_type, dest_type, dist):
    '''Create an input row for the MLP from the inputs.

    The row has ``input_size`` entries and is filled left to right:

    * both IPs, as four base-256 digits each (least significant first),
      every digit written with ``add_one_shot`` into a 256-wide slot;
    * origin and destination latitude/longitude, rescaled to
      ``[0, coordinate_size - 1]`` and written with ``add_thermo``;
    * origin and destination type, written with ``add_one_shot`` at index
      ``type_ + 1`` into a ``type_size``-wide slot;
    * the distance, as a fraction of ``max_earth_distance`` capped at 1,
      rescaled to ``[0, dest_size - 1]`` and written with ``add_thermo``.

    Returns the filled row.
    '''
    input_row = tensor.zeros([input_size])
    offset = 0

    ips = [ori_ip, dest_ip]
    for ip in ips:
        for _ in range(4):
            input_row = add_one_shot(input_row, offset, tensor.mod(ip, 256))
            ip = tensor.int_div(ip, 256)
            offset += 256

    for lat_, long_ in [(ori_lat, ori_long), (dest_lat, dest_long)]:
        translated_lat = tensor.iround(
            (coordinate_size - 1) * (lat_ / 180 + 0.5))
        input_row = add_thermo(input_row, offset, translated_lat)
        offset += coordinate_size

        translated_long = tensor.iround(
            (coordinate_size - 1) * (long_ / 360 + 0.5))
        input_row = add_thermo(input_row, offset, translated_long)
        offset += coordinate_size

    for type_ in [ori_type, dest_type]:
        # The result must be assigned back, as for every other call above:
        # it used to be discarded, so the type slots never reached the
        # returned row.
        input_row = add_one_shot(input_row, offset, type_ + 1)
        offset += type_size

    translated_dist = tensor.iround(
        (dest_size - 1) * (tensor.minimum(1, dist / max_earth_distance)))
    input_row = add_thermo(input_row, offset, translated_dist)

    # could be useful if we want to add something
    offset += dest_size

    return input_row
def prepare():
    """Build the symbolic inputs, network, loss and accuracy.

    The module-level ``args`` selects the variant: with ``"regression"`` a
    squared-error network is used and predictions are the rounded outputs
    clamped to ``[0, num_classes - 1]``; otherwise a logistic network is
    trained with the hybrid loss built from ``args["a"]`` / ``args["b"]``
    and predictions are the arg-max.  ``"regression"`` and ``"logistic"``
    must not both be present.

    Returns a ``Container`` holding the inputs, the output layer, its
    parameters, the loss, predictions, accuracy and the raw network output.
    """
    X = T.fmatrix('X')
    y = T.ivector('y')

    assert not ("regression" in args and "logistic" in args)

    if "regression" in args:
        output_layer = squared_error_net_adaptive()
    else:
        output_layer = logistic()
    all_params = lasagne.layers.get_all_params(output_layer)

    if "regression" in args:
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = squared_error(prob_vector, y).mean()
        top_class = args["num_classes"] - 1
        pred = T.maximum(0, T.minimum(T.round(prob_vector), top_class))
    else:
        loss_fn = get_hybrid_loss(args["a"], args["b"])
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = loss_fn(prob_vector, y).mean()
        pred = T.argmax(prob_vector, axis=1)

    accuracy = T.mean(T.eq(pred, y))

    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "pred": pred,
        "accuracy": accuracy,
        "prob_vector": prob_vector
    })
def relu_activation(x, leak_slope=0., clip_threshold=None, **kwargs):
    """Rectified linear unit with optional leak and optional upper clip.

    ``leak_slope`` is passed to ``T.nnet.relu`` and must not be ``None``.
    When ``clip_threshold`` is given the result is capped at that value.
    Extra keyword arguments are accepted and ignored.
    """
    # Reference:
    # Nair, Vinod, and Geoffrey E. Hinton. "Rectified linear units improve
    # restricted boltzmann machines." In Proceedings of the 27th
    # International Conference on Machine Learning (ICML-10), pp. 807-814.
    # 2010.
    #
    # Softplus can be approximated by max(0, x + N(0, sigmoid(x))); the
    # Gaussian noise is there because softplus behaves like a noisy,
    # integer-valued version of a smoothed rectified linear unit, and its
    # variance sigmoid(x) does not become large for large x.  Simplifying
    # further to max(0, x) gives the Rectified Linear (ReL) function, which:
    #   - has no vanishing gradient at +inf, like softplus
    #   - induces sparsity in activations
    #   - empirically allows deep networks to be trained effectively (ReLU)
    #   - can be used by RBMs to model real/integer valued inputs
    assert hasattr(T.nnet, 'relu'), ('It looks like like your version of '
                                     'Theano is out of date. '
                                     'Install the latest version with:\n'
                                     'pip install git+git://github.com/Theano/Theano.git --upgrade --no-deps')
    assert leak_slope is not None, "Leak slope cannot be None"

    rectified = T.nnet.relu(x, leak_slope)
    if clip_threshold is None:
        return rectified
    return T.minimum(rectified, clip_threshold)
def FeedforwardBatchNormalization(x, gamma, mask, estimated_mean=0.0,
                                  estimated_var=1.0):
    """Normalise the 3-dimensional ``x`` and scale it by ``gamma``.

    With a truthy ``mask`` (2-dimensional) the mean and variance are taken
    over the first two axes and blended with ``estimated_mean`` /
    ``estimated_var`` using a weight derived from the mask.  Otherwise the
    estimates are used directly; in that case they must support
    ``dimshuffle``, so the float defaults only work together with a mask.

    Returns ``(normalised, mean_used[0, 0], var_used[0, 0])``.
    """
    assert x.ndim == 3

    if mask:
        assert mask.ndim == 2
        mask3 = mask.dimshuffle(0, 1, 'x')

        # Mask total over the first two axes.
        active_count = T.sum(T.sum(mask3, axis=0), axis=0)
        # Per column of the second axis: 1 if any mask entry is set, divided
        # by the size of that axis.
        blend = T.cast(
            T.minimum(1.0, T.sum(mask3, axis=0)) / mask3.shape[1], 'float32')

        masked_total = T.sum(T.sum(x * mask3, axis=0), axis=0)
        batch_mean = (masked_total / active_count).dimshuffle('x', 'x', 0)
        mean_used = blend * batch_mean + (1.0 - blend) * estimated_mean

        centered = x - mean_used

        # NOTE(review): the squared deviations are summed without applying
        # the mask -- confirm this is intended.
        squared_total = T.sum(T.sum(centered**2, axis=0), axis=0)
        batch_var = (squared_total / active_count).dimshuffle('x', 'x', 0)
        var_used = blend * batch_var + (1.0 - blend) * estimated_var
    else:
        mean_used = estimated_mean.dimshuffle('x', 'x', 0)
        centered = x - mean_used
        var_used = estimated_var.dimshuffle('x', 'x', 0)

    normalised = gamma * (centered / T.sqrt(var_used + 1e-7))
    return normalised, mean_used[0, 0], var_used[0, 0]