def call(self, inputs, mask=None):
    """Return the drop-path output, or the averaged output at test time.

    If `self.force_path` is truthy, the result of `self._drop_path` is
    returned regardless of the learning phase. Otherwise the drop-path
    result is used in the training phase and `self._ave(inputs)` in the
    test phase. `mask` is accepted but not used.
    """
    dropped = self._drop_path(inputs)
    if self.force_path:
        return dropped
    return K.in_train_phase(dropped, self._ave(inputs))
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None, timesteps=None,
                           activation='linear'):
    '''Apply `activation(y.w + b)` for every temporal slice y of x.

    # Arguments
        x: input tensor; it is indexed as `x[:, t, :]`, i.e. treated as 3D.
        w: weight matrix.
        b: optional bias; skipped only when it is `None`.
        dropout: optional dropout rate in (0, 1); the same mask is used
            for every timestep, and only in the training phase.
        input_dim: optional size of the last axis of `x`.
        output_dim: optional size of the second axis of `w`.
        timesteps: optional size of the second axis of `x`.
        activation: identifier resolved through `activations.get`.

    # Returns
        Tensor reshaped to `(-1, timesteps, output_dim)`.
    '''
    activation = activations.get(activation)

    if not input_dim:
        # won't work with TensorFlow
        input_dim = K.shape(x)[2]
    if not timesteps:
        # won't work with TensorFlow
        timesteps = K.shape(x)[1]
    if not output_dim:
        # won't work with TensorFlow
        output_dim = K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x)

    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    # `b` is a tensor/variable: test for presence explicitly instead of
    # relying on its truth value, which is not a reliable "is it set" check.
    if b is not None:
        x = x + b
    # reshape to 3D tensor
    x = K.reshape(activation(x), (-1, timesteps, output_dim))
    return x
def call(self, x, mask=None):
    """Normalize `x` along `self.axis` and apply `gamma` / `beta`.

    In the training phase the statistics of the current batch are used; in
    the test phase the running statistics are used. As a side effect,
    `self.updates` is replaced with the moving-average updates of
    `self.running_mean` and `self.running_std`. `mask` is not used.
    """
    input_shape = self.input_spec[0].shape
    rank = len(input_shape)

    reduction_axes = list(range(rank))
    reduction_axes.pop(self.axis)

    broadcast_shape = [1] * rank
    broadcast_shape[self.axis] = input_shape[self.axis]

    def broadcast(tensor):
        # Reshape a per-feature tensor so it lines up with `x`.
        return K.reshape(tensor, broadcast_shape)

    # case: train mode (uses stats of the current batch)
    batch_mean = K.mean(x, axis=reduction_axes)
    batch_mean_b = broadcast(batch_mean)
    batch_std = K.sqrt(
        K.mean(K.square(x - batch_mean_b) + self.epsilon,
               axis=reduction_axes))
    batch_std_b = broadcast(batch_std)

    mean_update = (self.momentum * self.running_mean
                   + (1 - self.momentum) * batch_mean)
    std_update = (self.momentum * self.running_std
                  + (1 - self.momentum) * batch_std)
    self.updates = [(self.running_mean, mean_update),
                    (self.running_std, std_update)]
    normed_train = (x - batch_mean_b) / (batch_std_b + self.epsilon)

    # case: test mode (uses running averages)
    running_mean_b = broadcast(self.running_mean)
    running_std_b = broadcast(self.running_std)
    normed_test = (x - running_mean_b) / (running_std_b + self.epsilon)

    # pick the normalized form of x corresponding to the training phase
    x_normed = K.in_train_phase(normed_train, normed_test)
    return broadcast(self.gamma) * x_normed + broadcast(self.beta)
def call(self, inputs, training=None):
    """Run the wrapped layer, with concrete dropout applied when required.

    With `self.is_mc_dropout` set, dropout is always applied. Otherwise it
    is applied only in the training phase (as selected by `training`) and
    the wrapped layer sees the raw inputs at test time.
    """
    def relaxed_dropped_inputs():
        return self.layer.call(self.concrete_dropout(inputs))

    if self.is_mc_dropout:
        return relaxed_dropped_inputs()
    return K.in_train_phase(relaxed_dropped_inputs,
                            self.layer.call(inputs),
                            training=training)
def sampling(args):
    """Return a noisy latent code in training and `z_mean` at test time.

    `args` is unpacked as `(z_mean, z_log_var)`. `params` and `kl_loss_var`
    are read from the enclosing scope.
    """
    z_mean, z_log_var = args
    noise_shape = (params['batch_size'], params['hidden_dim'])
    # NOTE(review): `random_normal_variable` builds a variable, so the noise
    # looks like it is drawn once rather than per batch -- confirm intended.
    epsilon = K.random_normal_variable(shape=noise_shape, mean=0., scale=1.)
    # insert kl loss here
    std = K.exp(z_log_var / 2)
    z_rand = z_mean + std * kl_loss_var * epsilon
    return K.in_train_phase(z_rand, z_mean)
def call(self, inputs, training=None):
    """Apply dropout with rate `self.rate` in the training phase only.

    Rates outside the open interval (0, 1) leave `inputs` untouched.
    """
    if not (0. < self.rate < 1.):
        return inputs

    noise_shape = self._get_noise_shape(inputs)

    def dropped_inputs():
        return K.dropout(inputs, self.rate, noise_shape, seed=self.seed)

    return K.in_train_phase(dropped_inputs, inputs, training=training)
def get_constants(self, x):
    """Build the `[B_U, B_W]` dropout masks passed to the step function.

    Each entry is a per-sample mask that is dropped out only in the
    training phase, or the scalar 1. when that dropout is disabled. The
    input mask is only built when `self.consume_less == 'cpu'`.
    """
    def phase_mask(width, rate):
        # One row of ones per sample, `width` columns wide.
        ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
        ones = K.concatenate([ones] * width, 1)
        return K.in_train_phase(K.dropout(ones, rate), ones)

    if 0 < self.dropout_U < 1:
        recurrent_mask = phase_mask(self.output_dim, self.dropout_U)
    else:
        recurrent_mask = K.cast_to_floatx(1.)

    if self.consume_less == 'cpu' and 0 < self.dropout_W < 1:
        input_dim = self.input_spec[0].shape[-1]
        input_mask = phase_mask(input_dim, self.dropout_W)
    else:
        input_mask = K.cast_to_floatx(1.)

    return [recurrent_mask, input_mask]
def call(self, x, mask=None):
    """Look up rows of `self.W` for the indices in `x`.

    When `self.dropout` is in (0, 1), whole rows of `self.W` are randomly
    zeroed (and the rest rescaled by 1 / retain probability) in the
    training phase. `mask` is not used.
    """
    embeddings = self.W
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        row_mask = K.random_binomial((self.input_dim,), p=retain_p)
        row_mask = K.expand_dims(row_mask * (1. / retain_p))
        embeddings = K.in_train_phase(self.W * row_mask, self.W)
    return K.gather(embeddings, x)
def dot_product_attention(self, x, seq_len=None, dropout=0.1, training=None):
    """Compute softmax(q . k^T) . v with dropout on the attention weights.

    `x` is unpacked as `(q, k, v)`. `self.b` is added to the logits when
    `self.bias` is truthy, and `self.mask_logits` is applied when `seq_len`
    is given. Dropout is active only in the training phase.
    """
    queries, keys, values = x

    scores = tf.matmul(queries, keys, transpose_b=True)
    if self.bias:
        scores += self.b
    if seq_len is not None:
        scores = self.mask_logits(scores, seq_len)

    attention = tf.nn.softmax(scores, name="attention_weights")
    attention = K.in_train_phase(K.dropout(attention, dropout),
                                 attention,
                                 training=training)
    return tf.matmul(attention, values)
def call(self, x, mask=None):
    """Standardize `x` along `self.axis` in the training phase.

    Statistics come from `self._get_mean_and_std`. In the test phase, or
    when `self.normalize` is falsy, the input passes through unchanged
    (multiplied by 1. in the latter case). `mask` is not used.
    """
    if not self.normalize:
        return x * 1.

    mean, std = self._get_mean_and_std(x)

    broadcast_shape = [1] * K.ndim(x)
    broadcast_shape[self.axis] = K.shape(x)[self.axis]
    mean_b = K.reshape(mean, broadcast_shape)
    std_b = K.reshape(std, broadcast_shape)

    standardized = (x - mean_b) / (std_b + K.epsilon())
    return K.in_train_phase(standardized, x)
def get_constants(self, x):
    """Build `[B_U, B_W]`, each a list of three dropout masks.

    A mask is dropped out only in the training phase; when a dropout rate
    is outside (0, 1) the corresponding list holds three scalar 1. values.
    """
    n_masks = 3

    def phase_masks(width, rate):
        # One row of ones per sample, `width` columns wide; each of the
        # masks gets its own dropout op on the same base tensor.
        ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
        ones = K.concatenate([ones] * width, 1)
        return [K.in_train_phase(K.dropout(ones, rate), ones)
                for _ in range(n_masks)]

    def identity_masks():
        return [K.cast_to_floatx(1.) for _ in range(n_masks)]

    if 0 < self.dropout_U < 1:
        recurrent_masks = phase_masks(self.output_dim, self.dropout_U)
    else:
        recurrent_masks = identity_masks()

    if 0 < self.dropout_W < 1:
        input_dim = self.input_spec[0].shape[-1]
        input_masks = phase_masks(input_dim, self.dropout_W)
    else:
        input_masks = identity_masks()

    return [recurrent_masks, input_masks]
def call(self, x, mask=None):
    """Look up embeddings from `self.W` with `self.zeros_vector` prepended.

    When `self.dropout` is in (0, 1), whole rows of `self.W` are randomly
    zeroed (and the rest rescaled) in the training phase before
    `self.zeros_vector` is concatenated in front. `mask` is not used.
    """
    embeddings = self.W
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        row_mask = K.random_binomial((self.input_dim,), p=retain_p)
        row_mask = K.expand_dims(row_mask * (1. / retain_p))
        embeddings = K.in_train_phase(self.W * row_mask, self.W)

    padded = T.concatenate([self.zeros_vector, embeddings], axis=0)
    return K.gather(padded, x)
def __call__(self, loss):
    """Add a KL sparsity penalty on the layer outputs to `loss`.

    For every inbound node of `self.layer`, the mean absolute value of
    `sigmoid(0.1 * output)` is compared against `self.p` with
    `kl_divergence` and weighted by `self.l`. The penalty is applied in
    the training phase only.

    Raises:
        Exception: if `set_layer` has not been called on this instance.
    """
    if not hasattr(self, 'layer'):
        raise Exception('Need to call `set_layer` on '
                        'ActivityRegularizer instance '
                        'before calling the instance.')

    regularized_loss = loss
    for node_index, _ in enumerate(self.layer.inbound_nodes):
        squashed = K.sigmoid(0.1 * self.layer.get_output_at(node_index))
        p_hat = K.mean(K.abs(squashed))
        regularized_loss += self.l * kl_divergence(self.p, p_hat)
    return K.in_train_phase(regularized_loss, loss)
def call(self, x, mask=None):
    """Embed the indices in `x` using `self.W` in 'matrix' or 'tensor' mode.

    If `x` (or `mask`) is a two-element list, only its first element is
    used. When `self.dropout` is in (0, 1), slices of `self.W` along its
    leading axes are randomly zeroed in the training phase.

    Raises:
        Exception: if `self.mode` is neither 'matrix' nor 'tensor'.
    """
    if isinstance(x, list):
        x, _ = x
    if isinstance(mask, list):
        mask, _ = mask

    W = self.W
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        mask_dims = self.W._keras_shape[:-1]
        B = K.random_binomial(mask_dims, p=retain_p) * (1. / retain_p)
        W = K.in_train_phase(self.W * K.expand_dims(B), self.W)

    if self.mode == 'matrix':
        return K.gather(W, x)
    if self.mode != 'tensor':
        raise Exception('sanity check. should not be here.')

    # quick and dirty: only allowing for 3dim inputs when it's tensor mode
    assert K.ndim(x) == 3
    # Put the sequence axis first, gather, then take the diagonal across
    # the shared batch dimension: W is (B, S, F), incoming x is (B, S, A).
    # tensor abc goes to bac, indexed with xyz gives xyzac; since x == a,
    # reorder to xayzc == xxyzc and take the diagonal of the first two
    # axes to get xyzc. (A `.diagonal(axis1=0, axis2=3)` formulation does
    # not provide gradients, hence the paired-index gather below.)
    #
    # reference:
    #   A = S(np.arange(60).reshape(3,4,5))
    #   x = S(np.random.randint(0, 4, (3,4,10)))
    #   x_emb = A.dimshuffle(1,0,2)[x].dimshuffle(0,3,1,2,4)[
    #       T.arange(A.shape[0]), T.arange(A.shape[0])]
    inds = K.arange(self.W._keras_shape[0])
    gathered = K.gather(K.permute_dimensions(W, (1, 0, 2)), x)
    gathered = K.permute_dimensions(gathered, (0, 3, 1, 2, 4))
    return K.gather(gathered, (inds, inds))
def __call__(self, loss):
    """Penalize layer outputs whose per-sample sum is below a minimum.

    The minimum is `self.mask_size**2 * self.min_covered`. Samples at or
    below it contribute a squared shortfall scaled by
    `minimum / self.max_loss`; the mean penalty is added to `loss` in the
    training phase only.

    Raises:
        Exception: if `set_layer` has not been called on this instance.
    """
    if not hasattr(self, 'layer'):
        raise Exception('Need to call `set_layer` on '
                        'MaskRegularizer instance '
                        'before calling the instance.')

    min_tag_size = self.mask_size**2 * self.min_covered
    factor = min_tag_size / self.max_loss

    out_sum = self.layer.output.sum(axis=(1, 2, 3))
    shortfall_penalty = factor*(out_sum - min_tag_size)**2
    reg_loss = K.switch(out_sum <= min_tag_size, shortfall_penalty, 0)
    return K.in_train_phase(loss + reg_loss.mean(), loss)
def __call__(self, loss):
    """Add `self.k * sqrt(dominant eigenvalue of W^T W)` to `loss`.

    The dominant eigenvector is approximated with nine unnormalized
    power-method iterations starting from a vector of ones; the eigenvalue
    is its Rayleigh quotient. The penalty applies in the training phase
    only.
    """
    n_iterations = 9
    W = self.p
    WW = T.dot(W.T, W)
    dim1, _ = WW.shape.eval()  # the number of neurons in the layer

    # power method for approximating the dominant eigenvector
    domineigvec = np.ones(dim1)
    for _ in range(n_iterations):
        domineigvec = T.dot(WW, domineigvec)

    # the corresponding dominant eigenvalue
    WWd = T.dot(WW, domineigvec)
    domineigval = T.dot(WWd, domineigvec) / T.dot(domineigvec, domineigvec)

    # multiplied by the given regularization gain
    regularized_loss = loss + (domineigval ** 0.5) * self.k
    return K.in_train_phase(regularized_loss, loss)
def call(self, x, mask=None):
    """Look up rows of the column-normalized embedding matrix.

    `x` is cast to int32 if needed. When `self.dropout` is in (0, 1), rows
    of `self.W` are randomly zeroed (and the rest rescaled) in the training
    phase. The matrix is then divided by its sum over axis 0 before the
    lookup. `mask` is not used.
    """
    if K.dtype(x) != 'int32':
        x = K.cast(x, 'int32')

    embeddings = self.W
    if 0. < self.dropout < 1.:
        retain_p = 1. - self.dropout
        row_mask = K.random_binomial((self.input_dim,), p=retain_p)
        row_mask = K.expand_dims(row_mask * (1. / retain_p))
        embeddings = K.in_train_phase(self.W * row_mask, self.W)

    column_totals = K.sum(embeddings, axis=0)
    return K.gather(embeddings / column_totals, x)
def __call__(self, loss):
    """Penalize the low-frequency content of the layer outputs.

    For every inbound node of `self.layer`, the output is passed through
    `gaussian_filter_2d` with `self.sigma`, and the sum of absolute values
    of the result, scaled by `self.factor`, is accumulated. The accumulated
    penalty is added to `loss` in the training phase only.

    Raises:
        Exception: if `set_layer` has not been called on this instance.
    """
    if not hasattr(self, 'layer'):
        raise Exception('Need to call `set_layer` on '
                        'LowFrequenciesRegularizer instance '
                        'before calling the instance. ')

    # Leftover debug printing of the inbound nodes, loop index and symbolic
    # loss was removed: it wrote to stdout every time the loss was built.
    regularized_loss = K.zeros_like(loss)
    for node_index, _ in enumerate(self.layer.inbound_nodes):
        out = self.layer.get_output_at(node_index)
        low_freq = gaussian_filter_2d(out, self.sigma)
        regularized_loss += K.sum(K.abs(low_freq)) * self.factor
    return K.in_train_phase(loss + regularized_loss, loss)
def get_constants(self, inputs, training=None):
    """Return a one-element list holding three recurrent dropout masks.

    Each mask has one row per sample and `self.units` columns, and is
    dropped out only in the training phase. When `self.recurrent_dropout`
    is outside (0, 1) the masks are scalar 1. values.
    """
    if not (0. < self.recurrent_dropout < 1.):
        return [[K.cast_to_floatx(1.) for _ in range(3)]]

    ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
    ones = K.tile(ones, (1, self.units))

    def dropped_inputs():
        return K.dropout(ones, self.recurrent_dropout)

    rec_dp_mask = [
        K.in_train_phase(dropped_inputs, ones, training=training)
        for _ in range(3)
    ]
    return [rec_dp_mask]
def __call__(self, loss):
    """Add `self.k * sqrt(dominant eigenvalue of W^T W)` to `loss`.

    The dominant eigenvector is approximated with `power` unnormalized
    power-method iterations starting from a vector of ones; the eigenvalue
    is its Rayleigh quotient. The penalty applies in the training phase
    only.
    """
    power = 9  # number of iterations of the power method
    W = self.p
    WW = K.dot(K.transpose(W), W)
    dim1, _ = K.eval(K.shape(WW))

    # power method for approximating the dominant eigenvector
    domin_eigenvect = np.ones(dim1)
    for _ in range(power):
        domin_eigenvect = K.dot(WW, domin_eigenvect)

    # the corresponding dominant eigenvalue
    WWd = K.dot(WW, domin_eigenvect)
    rayleigh_num = K.dot(WWd, domin_eigenvect)
    rayleigh_den = K.dot(domin_eigenvect, domin_eigenvect)
    domin_eigenval = rayleigh_num / rayleigh_den

    # multiplied by the given regularization gain
    regularized_loss = loss + (domin_eigenval ** 0.5) * self.k
    return K.in_train_phase(regularized_loss, loss)
def __call__(self, loss):
    """Add mean-L1 / mean-L2 penalties on the (optionally masked) weights.

    The parameter `self.p` is multiplied by `self.p_mask` when a mask is
    set. The `self.l1` and `self.l2` terms are added only when truthy, and
    only in the training phase.

    Raises:
        Exception: if `set_param` has not been called on this instance.
    """
    if not hasattr(self, 'p'):
        raise Exception('Need to call `set_param` on '
                        'WeightRegularizer instance '
                        'before calling the instance. '
                        'Check that you are not passing '
                        'a WeightRegularizer instead of an '
                        'ActivityRegularizer '
                        '(i.e. activity_regularizer="l2" instead '
                        'of activity_regularizer="activity_l2".')

    weights = self.p if self.p_mask is None else self.p * self.p_mask

    regularized_loss = loss
    if self.l1:
        regularized_loss += K.mean(K.abs(weights)) * self.l1
    if self.l2:
        regularized_loss += K.mean(K.square(weights)) * self.l2
    return K.in_train_phase(regularized_loss, loss)
def time_distributed_dense(x, w, b=None, dropout=None,
                           input_dim=None, output_dim=None,
                           timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.

    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: whether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.

    # Returns
        Output tensor.
    """
    # Fall back to symbolic shapes for any dimension that was not supplied.
    input_dim = input_dim or K.shape(x)[2]
    timesteps = timesteps or K.shape(x)[1]
    output_dim = output_dim or K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        first_step_ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        step_mask = K.dropout(first_step_ones, dropout)
        sequence_mask = K.repeat(step_mask, timesteps)
        x = K.in_train_phase(x * sequence_mask, x, training=training)

    # collapse time dimension and batch dimension together
    flat = K.dot(K.reshape(x, (-1, input_dim)), w)
    if b is not None:
        flat = K.bias_add(flat, b)

    # reshape to 3D tensor
    if K.backend() != 'tensorflow':
        return K.reshape(flat, (-1, timesteps, output_dim))
    out = K.reshape(flat, K.stack([-1, timesteps, output_dim]))
    out.set_shape([None, None, output_dim])
    return out
def __call__(self, loss):
    """Add L1, L2 and adjacent-column difference penalties on `self.p`.

    The third term (weight `self.m`) is the sum of squares of
    `self.p . D`, where `D` is the identity minus its first
    super-diagonal, with the bottom-right entry zeroed. All terms apply in
    the training phase only.

    Raises:
        Exception: if `set_param` has not been called on this instance.
    """
    if not hasattr(self, 'p'):
        raise Exception('Need to call `set_param` on '
                        'WeightRegularizer instance '
                        'before calling the instance. '
                        'Check that you are not passing '
                        'a WeightRegularizer instead of an '
                        'ActivityRegularizer '
                        '(i.e. activity_regularizer="l2" instead '
                        'of activity_regularizer="activity_l2".')

    l1_term = K.sum(K.abs(self.p)) * self.l1
    regularized_loss = loss + l1_term
    l2_term = K.sum(K.square(self.p)) * self.l2
    regularized_loss += l2_term

    # Difference operator built on the host from the evaluated shape.
    out_dim = self.p.shape.eval()[-1]
    diff_mat = np.eye(out_dim) - np.eye(out_dim, k=1)
    diff_mat[-1, -1] = 0
    diff_op = K.variable(diff_mat)

    regularized_loss += K.sum(K.square(K.dot(self.p, diff_op))) * self.m
    return K.in_train_phase(regularized_loss, loss)
def call(self, x):
    """Apply a sigmoid, then HMM forward-backward (or viterbi at test time).

    Without `self.viterbi_inference`, the first output of
    `self.hmm.forward_backward` is always returned. With it, that output is
    used while the learning phase is on, and the first output of
    `self.hmm.viterbi_decode_batched(..., onehot=True)` otherwise.
    """
    # todo: only optionally apply sigmoid
    # todo: apply viterbi during inference
    x = Activation(K.sigmoid)(x)

    if not self.viterbi_inference:
        return Lambda(lambda x: self.hmm.forward_backward(x)[0])(x)

    # K.in_train_phase would compute both branches, which is very expensive
    # here, so tf.cond is used for the actual switch. Both branches are
    # wrapped in lambdas so that neither is computed unless it is chosen.
    #
    # This no-op in_train_phase is still needed: it puts the learning phase
    # into the graph so keras knows the learning phase variable has to be
    # passed into tensorflow's session run.
    x = K.in_train_phase(x, x)
    return Lambda(lambda x: tf.cond(
        K.learning_phase(),
        lambda: self.hmm.forward_backward(x)[0],
        lambda: self.hmm.viterbi_decode_batched(x, onehot=True)[0],
    ))(x)
def call(self, x):
    """Call the wrapped layer with dropout applied to its kernel and bias.

    When `self.prob` is in (0, 1), `self.layer.kernel` and
    `self.layer.bias` are replaced by tensors that are dropped out in the
    training phase and unchanged otherwise.
    """
    if 0. < self.prob < 1.:
        # NOTE(review): the attributes are overwritten in place, so calling
        # this more than once presumably stacks dropout ops -- confirm.
        for attr in ('kernel', 'bias'):
            weight = getattr(self.layer, attr)
            dropped = K.in_train_phase(K.dropout(weight, self.prob), weight)
            setattr(self.layer, attr, dropped)
    return self.layer.call(x)
def call(self, x, mask=None, training=None):
    """Combine a branch with its residual, randomly skipping the branch.

    `x` is unpacked as `(branch, residual)`. In the training phase, with
    probability `self.dropout` only the residual is returned; otherwise the
    branch gets element dropout (keep probability `1 - self.dropout`) and
    is added to the residual. At test time the plain sum is returned.
    `mask` is not used.
    """
    branch, residual = x
    skip_branch = tf.random_uniform([]) < self.dropout

    def residual_only():
        return residual

    def dropped_branch_plus_residual():
        return tf.nn.dropout(branch, 1.0 - self.dropout) + residual

    train_output = tf.cond(skip_branch,
                           residual_only,
                           dropped_branch_plus_residual)
    test_output = branch + residual
    return K.in_train_phase(train_output, test_output, training=training)
def get_yj_means(self):
    """Return the transpose of `self.mj` (training) or `self.mjr` (test)."""
    phase_means = K.in_train_phase(self.mj, self.mjr)
    return K.transpose(phase_means)
def call(self, inputs, training=None):
    """Pass `inputs` through in training; return Gaussian noise at test time.

    # Arguments
        inputs: input tensor.
        training: training phase tensor or boolean; when `None` the global
            learning phase decides.

    # Returns
        `inputs` in the training phase, otherwise a tensor of the same
        shape drawn from `K.random_normal`.
    """
    noise = K.random_normal(K.shape(inputs))
    # Forward `training` so an explicit phase from the caller is honoured;
    # it was previously accepted but ignored.
    return K.in_train_phase(inputs, noise, training=training)
def call(self, x, mask=None):
    """Apply batch renormalization to `x`.

    Modes 0 and 2 normalize with batch statistics corrected by the clipped
    factors `r` and `d`, and register updates for the running statistics,
    `r_max`, `d_max` and `t`. In mode 0 the running statistics are used at
    test time. Mode 1 normalizes each sample along `self.axis`.
    `mask` is not used.

    # Arguments
        x: input tensor.
        mask: ignored.

    # Returns
        The normalized tensor.
    """
    if self.mode == 0 or self.mode == 2:
        assert self.built, 'Layer must be built before being called'
        input_shape = K.int_shape(x)

        reduction_axes = list(range(len(input_shape)))
        del reduction_axes[self.axis]
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis]

        # `range(...)` must be turned into a list before comparing: on
        # Python 3 a list never equals a range object, so the original
        # test always fell through to the broadcasting branch.
        needs_broadcast = (
            sorted(reduction_axes) != list(range(K.ndim(x)))[:-1])

        mean_batch, var_batch = _moments(x, reduction_axes,
                                         shift=None, keep_dims=False)
        std_batch = (K.sqrt(var_batch + self.epsilon))

        r_max_value = K.get_value(self.r_max)
        r = std_batch / (K.sqrt(self.running_std + self.epsilon))
        r = K.stop_gradient(K.clip(r, 1 / r_max_value, r_max_value))

        d_max_value = K.get_value(self.d_max)
        d = ((mean_batch - self.running_mean)
             / K.sqrt(self.running_std + self.epsilon))
        d = K.stop_gradient(K.clip(d, -d_max_value, d_max_value))

        if not needs_broadcast:
            x_normed_batch = (x - mean_batch) / std_batch
            x_normed = (x_normed_batch * r + d) * self.gamma + self.beta
        else:
            # need broadcasting
            broadcast_mean = K.reshape(mean_batch, broadcast_shape)
            broadcast_std = K.reshape(std_batch, broadcast_shape)
            broadcast_r = K.reshape(r, broadcast_shape)
            broadcast_d = K.reshape(d, broadcast_shape)
            broadcast_beta = K.reshape(self.beta, broadcast_shape)
            broadcast_gamma = K.reshape(self.gamma, broadcast_shape)

            x_normed_batch = (x - broadcast_mean) / broadcast_std
            x_normed = ((x_normed_batch * broadcast_r + broadcast_d)
                        * broadcast_gamma + broadcast_beta)

        # explicit update to moving mean and standard deviation
        self.add_update([
            K.moving_average_update(self.running_mean, mean_batch,
                                    self.momentum),
            K.moving_average_update(self.running_std, std_batch**2,
                                    self.momentum)
        ], x)

        # update r_max and d_max
        r_val = self.r_max_value / (
            1 + (self.r_max_value - 1) * K.exp(-self.t))
        d_val = self.d_max_value / (1 + (
            (self.d_max_value / 1e-3) - 1) * K.exp(-(2 * self.t)))

        self.add_update([
            K.update(self.r_max, r_val),
            K.update(self.d_max, d_val),
            K.update_add(self.t, K.variable(np.array([self.t_delta])))
        ], x)

        if self.mode == 0:
            if not needs_broadcast:
                x_normed_running = K.batch_normalization(
                    x, self.running_mean, self.running_std,
                    self.beta, self.gamma,
                    epsilon=self.epsilon)
            else:
                # need broadcasting
                broadcast_running_mean = K.reshape(self.running_mean,
                                                   broadcast_shape)
                broadcast_running_std = K.reshape(self.running_std,
                                                  broadcast_shape)
                broadcast_beta = K.reshape(self.beta, broadcast_shape)
                broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
                x_normed_running = K.batch_normalization(
                    x, broadcast_running_mean, broadcast_running_std,
                    broadcast_beta, broadcast_gamma,
                    epsilon=self.epsilon)

            # pick the normalized form of x corresponding to the training
            # phase; for batch renormalization, inference time remains the
            # same as batchnorm
            x_normed = K.in_train_phase(x_normed, x_normed_running)

    elif self.mode == 1:
        # sample-wise normalization
        m = K.mean(x, axis=self.axis, keepdims=True)
        std = K.sqrt(
            K.var(x, axis=self.axis, keepdims=True) + self.epsilon)
        x_normed_batch = (x - m) / (std + self.epsilon)

        r_max_value = K.get_value(self.r_max)
        r = std / (self.running_std + self.epsilon)
        r = K.stop_gradient(K.clip(r, 1 / r_max_value, r_max_value))

        d_max_value = K.get_value(self.d_max)
        d = (m - self.running_mean) / (self.running_std + self.epsilon)
        d = K.stop_gradient(K.clip(d, -d_max_value, d_max_value))

        x_normed = ((x_normed_batch * r) + d) * self.gamma + self.beta

        # update r_max and d_max (computed on the host from the current
        # value of `t`)
        t_val = K.get_value(self.t)
        r_val = self.r_max_value / (
            1 + (self.r_max_value - 1) * np.exp(-t_val))
        d_val = self.d_max_value / (1 + (
            (self.d_max_value / 1e-3) - 1) * np.exp(-(2 * t_val)))
        t_val += float(self.t_delta)

        self.add_update([
            K.update(self.r_max, r_val),
            K.update(self.d_max, d_val),
            K.update(self.t, t_val)
        ], x)

    return x_normed
def step(self, h, states):
    '''Compute one GRU time step, teacher-forced in training.

    @inp : h - [previous_layer_input, true_input_for_previous_timestep]
               at train time, or [previous_layer_input, zeros] at test time
    @inp : states - a dictionary containing
               'initial_states'     - state vector; its first slice is the
                                      previous hidden state and the rest is
                                      the previously generated output
               'random_cutoff_prob' - random cutoff matrix used for
                                      sampling at test time
               'rec_dp_mask'        - recurrent dropout masks, read only
                                      when `self.recurrent_dropout > 0`
                                      (not tested - may break)

    @return: (output, [output]) where output stacks the raw, unsampled
             output with the final output (sampled in the test case),
             squeezed on axis 1

    @raise ValueError: if `self.implementation` is not 0
    '''
    ################
    # Parsing the states vector
    ################
    initial_states = states['initial_states']
    random_cutoff_vec = states['random_cutoff_prob']

    if self.recurrent_dropout > 0:
        rec_dp_mask = states['rec_dp_mask']
    else:
        rec_dp_mask = np.array([1., 1., 1., 1.], dtype='float32')

    h_tm1 = initial_states[0][:1, :, :]

    def slot_major(tensor):
        # switching from
        # (batch_size, previous_layer_input|true_input, output_dim) to
        # (previous_layer_input|true_input, batch_size, output_dim)
        axes = [1, 0] + list(range(2, K.ndim(tensor)))
        return K.permute_dimensions(tensor, axes)

    def gru_output(prev_layer_input, prev_sampled_output):
        # Shared gate arithmetic for both the teacher-forced and the
        # free-running paths.
        if self.implementation != 0:
            # str() is required: `implementation` is normally an int, and
            # concatenating it directly raised TypeError instead of the
            # intended ValueError.
            raise ValueError('Implementation type ' +
                             str(self.implementation) + ' is invalid')

        x_z = prev_layer_input[0, :, :self.units]
        x_r = prev_layer_input[0, :, self.units: 2 * self.units]
        x_h = prev_layer_input[0, :, 2 * self.units:]

        z = self.recurrent_activation(
            x_z + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_z))
        r = self.recurrent_activation(
            x_r + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_r))
        hh = self.activation(
            x_h +
            K.dot(r * h_tm1 * rec_dp_mask[2], self.recurrent_kernel_h) +
            K.dot(r * prev_sampled_output, self.recurrent_kernel_y))
        return z * h_tm1 + (1. - z) * hh

    def teacher_forced(h, states):
        h = slot_major(h)
        prev_layer_input = h[0:1, :, :]
        # this should correspond to true input
        true_input = h[1:, :, :self.units]
        output = gru_output(prev_layer_input, true_input)
        return K.stack([output, output])

    def free_running(h, states):
        prev_generated_output = initial_states[0][1:, :, :]
        h = slot_major(h)
        prev_layer_input = h[0:1, :, :]
        output = gru_output(prev_layer_input, prev_generated_output)
        final_output = self.output_sampling(output, random_cutoff_vec)
        return K.stack([output, final_output])

    # Both paths are built eagerly; the learning phase selects one.
    output_2d_tensor = K.in_train_phase(teacher_forced(h, states),
                                        free_running(h, states))
    output_2d_tensor = K.squeeze(output_2d_tensor, 1)
    return output_2d_tensor, [output_2d_tensor]
def call(self, inputs, mask=None, training=None, initial_state=None):
    """Run the recurrent layer over a list of input tensors.

    `inputs` is a list; for training it is described in the original
    comments as [aux_softmax, ground truth, dialogue act vector]. Which
    elements are consumed depends on `self.semantic_condition`,
    `self.condition_on_ptm1` and `self.generation_only`.

    # Arguments
        inputs: list of input tensors.
        mask: optional mask; if a list, only its first element is used.
        training: training phase tensor or boolean, forwarded to the
            dropout-constant builders and to `preprocess_input`.
        initial_state: accepted for interface compatibility; it is always
            replaced by `self.get_initial_state(...)`.

    # Returns
        The last output or the full output sequence (per
        `self.return_sequences`), optionally followed by the states (per
        `self.return_state`), or `[output, last_da, da_outputs]` when
        semantic conditioning with `self.return_da` is enabled.
    """
    # input shape: `(samples, time (padded with zeros), input_dim)`
    # note that the .build() method of subclasses MUST define
    # self.input_spec and self.state_spec with complete input shapes.
    input_list = inputs

    if self.condition_on_ptm1 and not self.generation_only:
        # Training consumes the aux input concatenated with the original
        # input shifted by one; at test time the shifted input is replaced
        # by p_{t-1} inside `step`. The switch below follows the global
        # learning phase (no `training=`) to stay in sync with `step`,
        # which has no access to `training`.
        aux_inputs = concatenate(inputs=input_list[:2])
        initial_state = self.get_initial_state(aux_inputs)
        constants = self.get_constants(aux_inputs, training=training)
        inputs = K.in_train_phase(aux_inputs, input_list[0])
    elif (not self.generation_only
          and not self.semantic_condition
          and not self.condition_on_ptm1):
        # takes aux input for train and testing (vanilla lstm)
        inputs = input_list
        initial_state = self.get_initial_state(inputs)
        constants = self.get_constants(inputs, training=training)
    else:
        # takes orig_input while training (plus the dialogue act for
        # conditioning when semantic conditioning is on)
        inputs = input_list[0]
        initial_state = self.get_initial_state(inputs)
        constants = self.get_constants(inputs, training=training)

    if self.semantic_condition:
        dialogue_act = input_list[-1]
        initial_state = initial_state + [dialogue_act]
        sc_constants = self.get_sc_constants(dialogue_act,
                                             training=training)
        constants = constants + sc_constants

    if self.condition_on_ptm1:
        p0 = self.get_initial_p(inputs)
        initial_state += p0

    if isinstance(mask, list):
        mask = mask[0]

    # `training` used to be hard-coded to None here and above, so an
    # explicit phase passed by the caller never reached the dropout masks.
    preprocessed_input = self.preprocess_input(inputs, training=training)
    rnn_output = sc_tf_rnn(self.step,
                           preprocessed_input,
                           initial_state,
                           semantic_conditioning=self.semantic_condition,
                           go_backwards=self.go_backwards,
                           mask=mask,
                           constants=constants)

    if self.semantic_condition:
        last_output, outputs, last_da, da_outputs, states = rnn_output
    else:
        last_output, outputs, states = rnn_output

    # Properly set learning phase
    if 0.0 < self.dropout + self.recurrent_dropout + self.sc_dropout:
        last_output._uses_learning_phase = True
        outputs._uses_learning_phase = True

    output = outputs if self.return_sequences else last_output

    if self.return_state:
        if not isinstance(states, (list, tuple)):
            states = [states]
        else:
            states = list(states)
        output = [output] + states

    if self.semantic_condition and self.return_da:
        output = [output, last_da, da_outputs]

    return output
def one_zero(x):
    """Return zeros shaped like `x` in training and ones at test time."""
    train_value = K.zeros_like(x)
    test_value = K.ones_like(x)
    return K.in_train_phase(train_value, test_value)
def f(t):
    """Project `t` with `ortho_weights` (training) or the stored copy (test).

    `ortho_weights` and `ortho_weights_store` come from the enclosing scope.
    """
    train_projection = K.dot(t, ortho_weights)
    test_projection = K.dot(t, ortho_weights_store)
    return K.in_train_phase(train_projection, test_projection)
def call(self, inputs):
    """Dispatch to `call_training` or `call_inference` by learning phase.

    `inputs[0]` is passed as `x` and `inputs[1]` as `loga`. Both branches
    are built; the learning phase selects which one is used.
    """
    x, loga = inputs[0], inputs[1]
    train_output = self.call_training(loga, x)
    test_output = self.call_inference(loga, x)
    return K.in_train_phase(train_output, test_output)
def call(self, x, mask=None):
    """Apply `hybo_tf` to `x` in the training phase when `self.p` is in (0, 1].

    Outside that range, or at test time, `x` is returned unchanged.
    `mask` is not used.
    """
    if not (0. < self.p <= 1.):
        return x
    transformed = hybo_tf(x,
                          p=self.p,
                          shift=self.shift,
                          unif=self.unif,
                          just_dropout=self.just_dropout)
    return K.in_train_phase(transformed, x)
def PadSymmetricInTestPhase():
    """Build a Lambda layer that pads symmetrically at test time only.

    In the training phase the layer is the identity. At test time it pads
    the second and third axes by 2 on each side in 'SYMMETRIC' mode. The
    layer is flagged with `uses_learning_phase = True`.
    """
    layer = Lambda(
        lambda x: K.in_train_phase(
            x,
            tf.pad(x,
                   tf.constant([[0, 0], [2, 2], [2, 2], [0, 0]]),
                   'SYMMETRIC')))
    layer.uses_learning_phase = True
    return layer
def apply_dropout_if_needed(self, _input, training=None):
    """Apply dropout with rate 0.5 to `_input` in the training phase only."""
    return K.in_train_phase(lambda: K.dropout(_input, 0.5),
                            _input,
                            training=training)
def build(self, input_shape):
    """Create the layer weights and the (optionally drop-connected) kernels.

    Weights are added in a fixed order: the three gate kernels, twelve
    `(1, self.dk)` attention vectors, the aggregation matrix and the bias.
    When `self.dropconnect` is truthy, `_kernel`, `_recurrent_kernel` and
    `_cell_kernel` are dropped out in the training phase; otherwise they
    alias the plain weights.
    """
    input_dim = input_shape[-1]
    gate_width = self.units * 5

    self.kernel = self.add_weight(shape=(input_dim, gate_width),
                                  name='kernel',
                                  initializer='glorot_uniform')
    self.recurrent_kernel = self.add_weight(shape=(self.units, gate_width),
                                            name='recurrent_kernel',
                                            initializer='orthogonal')
    self.cell_kernel = self.add_weight(shape=(self.units, gate_width),
                                       name='cell_kernel',
                                       initializer='orthogonal')

    # Attention vectors share shape and initializer; the tuple order is
    # the order in which the weights are registered.
    attention_names = (
        'up_att_downl', 'up_att_downr', 'up_att_randl', 'up_att_randr',
        'down_att_upl', 'down_att_upr', 'down_att_randl', 'down_att_randr',
        'rand_att_upl', 'rand_att_upr', 'rand_att_downl', 'rand_att_downr',
    )
    for weight_name in attention_names:
        setattr(self, weight_name,
                self.add_weight(shape=(1, self.dk),
                                name=weight_name,
                                initializer='glorot_uniform'))

    self.aggregation = self.add_weight(shape=(self.units * 3, self.units),
                                       name='aggregation',
                                       initializer='glorot_uniform')
    self.bias = self.add_weight(shape=(gate_width, ),
                                name='bias',
                                initializer='zeros')
    self.built = True

    def drop_connected(weight):
        # Dropped-out weight in training, plain weight at test time.
        return K.in_train_phase(K.dropout(weight, self.dropconnect), weight)

    if self.dropconnect:
        self._kernel = drop_connected(self.kernel)
        self._recurrent_kernel = drop_connected(self.recurrent_kernel)
        self._cell_kernel = drop_connected(self.cell_kernel)
    else:
        self._kernel = self.kernel
        self._recurrent_kernel = self.recurrent_kernel
        self._cell_kernel = self.cell_kernel
def rounded_sigmoid(name="rounded_sigmoid"):
    """Build a Lambda layer: sigmoid in training, rounded sigmoid at test.

    Each call increments the module-level `rounded_sigmoid_counter` and
    names the layer `"<name>_<counter>"`.
    """
    global rounded_sigmoid_counter
    rounded_sigmoid_counter += 1
    layer_name = "{}_{}".format(name, rounded_sigmoid_counter)
    return Lambda(
        lambda x: K.in_train_phase(K.sigmoid(x), K.round(K.sigmoid(x))),
        name=layer_name)
def call(self, x, training=None):
    """Pick one step along axis 1: a relative position in training, the last at test.

    In the training phase the index is `round((len - 0.5) * self.pos)`,
    where `len` is the size of axis 1; at test time the final step is used.
    """
    seq_len = tf.cast(tf.shape(x)[1], tf.float32)
    position = tf.cast(tf.round((seq_len - .5) * self.pos), tf.int32)
    picked = x[:, position, :]
    return K.in_train_phase(picked, x[:, -1, :], training=training)
def step(self, inputs, states):
    """Compute one LSTM step with optional semantic conditioning.

    `states` is laid out as `[h, c]`, then `d` when
    `self.semantic_condition` is set, then `p` when
    `self.condition_on_ptm1` is set, followed by the input and recurrent
    dropout masks and, with semantic conditioning, the dialogue-act
    dropout mask.

    Returns `(p_softmax, new_states)` where `new_states` is `[h, c]`
    followed by `d` and/or the fed-back distribution, mirroring the input
    layout.
    """
    use_sc = self.semantic_condition
    use_ptm1 = self.condition_on_ptm1

    # Unpack the positional state list.
    h_tm1, c_tm1 = states[0], states[1]
    cursor = 2
    if use_sc:
        d_tm1 = states[cursor]
        cursor += 1
    if use_ptm1:
        p_tm1 = states[cursor]
        cursor += 1
    dp_mask = states[cursor]
    rec_dp_mask = states[cursor + 1]
    if use_sc:
        sc_dp_mask = states[cursor + 2]

    if use_ptm1:
        # At test time the previous prediction is fed back, either on its
        # own (semantic conditioning in generation-only mode) or appended
        # to the inputs.
        if use_sc and self.generation_only:
            inputs = K.in_train_phase(inputs, p_tm1)
        else:
            inputs = K.in_train_phase(
                inputs, K.concatenate([inputs, p_tm1], axis=1))

    z = K.dot(inputs * dp_mask[0], self.kernel)
    z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
    if self.use_bias:
        z = K.bias_add(z, self.bias)

    units = self.units
    z0 = z[:, :units]
    z1 = z[:, units:2 * units]
    z2 = z[:, 2 * units:3 * units]
    z3 = z[:, 3 * units:]

    i = self.recurrent_activation(z0)
    f = self.recurrent_activation(z1)
    c = f * c_tm1 + i * self.activation(z2)
    if use_sc:
        r = self.recurrent_activation(
            K.dot(inputs * dp_mask[0], self.kernel_r) +
            self.alpha * K.dot(h_tm1 * rec_dp_mask[0],
                               self.recurrent_kernel_r))
        # NOTE(review): the bias is added after the activation here --
        # confirm this is intended.
        if self.use_bias:
            r = K.bias_add(r, self.bias_r)
        d = r * d_tm1
        c = c + self.activation(K.dot(d * sc_dp_mask[0], self.kernel_d))
    o = self.recurrent_activation(z3)

    h = o * self.activation(c)

    # output distribution of target word prob: p in (batch_size, nclasses)
    if self.softmax_temperature is not None:
        p_softmax = K.softmax(
            K.dot(h, self.out_kernel) / self.softmax_temperature)
        p_ret = p_softmax
    else:
        p_softmax = K.softmax(K.dot(h, self.out_kernel))
        p_ret = K.in_train_phase(
            p_softmax,
            K.one_hot(K.argmax(p_softmax, axis=1), self.out_units))

    if 0.0 < self.dropout + self.recurrent_dropout + self.sc_dropout:
        h._uses_learning_phase = True

    new_states = [h, c]
    if use_sc:
        new_states.append(d)
    if use_ptm1:
        new_states.append(p_ret)
    return p_softmax, new_states
def call(self, x):
    """Return ``x`` multiplied by ``self.Q`` in the training phase, else ``x``."""
    projected = K.dot(x, self.Q)
    return K.in_train_phase(projected, x)
def call(self, x, mask=None):
    """Apply ``K.relu`` with a phase-dependent second argument.

    In the training phase the second argument of ``K.relu`` is drawn with
    ``K.random_uniform`` between ``self.l`` and ``self.u`` (one value per
    element of ``x``); otherwise ``self.average`` is used. ``mask`` is
    accepted but not used.
    """
    sampled = K.random_uniform(K.shape(x), self.l, self.u)
    train_output = K.relu(x, sampled)
    test_output = K.relu(x, self.average)
    return K.in_train_phase(train_output, test_output)
def call(self, x, mask=None):
    """Normalise ``x`` according to ``self.mode``.

    * mode 0: feature-wise normalisation. Batch statistics are used in the
      training phase and the running statistics otherwise; moving-average
      updates for ``running_mean`` / ``running_std`` are stored in
      ``self.updates`` and ``self.called_with`` is set to ``x``.
    * mode 2: feature-wise normalisation with batch statistics in both
      phases (no running statistics are touched).
    * mode 1: sample-wise normalisation over the last axis, then scaled by
      ``self.gamma`` and shifted by ``self.beta``.

    Args:
        x: input tensor.
        mask: accepted for interface compatibility; not used.

    Returns:
        The normalised tensor.
    """
    if self.mode == 0 or self.mode == 2:
        assert self.built, 'Layer must be built before being called'
        input_shape = self.input_spec[0].shape

        # Reduce over every axis except the feature axis.
        reduction_axes = list(range(len(input_shape)))
        del reduction_axes[self.axis]
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis]

        if self.mode == 2:
            x_normed, mean, std = K.normalize_batch_in_training(
                x, self.gamma, self.beta, reduction_axes,
                epsilon=self.epsilon)
        else:
            # mode 0
            # NOTE: the original "layer shared across data flows" check was
            # disabled (`... and False`) and could never raise; the dead
            # branch has been dropped, the bookkeeping is kept.
            self.called_with = x
            x_normed, mean, std = K.normalize_batch_in_training(
                x, self.gamma, self.beta, reduction_axes,
                epsilon=self.epsilon)

            self.updates = [
                K.moving_average_update(self.running_mean, mean,
                                        self.momentum),
                K.moving_average_update(self.running_std, std,
                                        self.momentum)
            ]

            # `list(...)` is required: on Python 3 a list never compares
            # equal to a `range`, which silently disabled this branch.
            if sorted(reduction_axes) == list(range(K.ndim(x)))[:-1]:
                # Statistics live on the last axis: no reshape needed.
                x_normed_running = K.batch_normalization(
                    x, self.running_mean, self.running_std,
                    self.beta, self.gamma,
                    epsilon=self.epsilon)
            else:
                # need broadcasting
                broadcast_running_mean = K.reshape(self.running_mean,
                                                   broadcast_shape)
                broadcast_running_std = K.reshape(self.running_std,
                                                  broadcast_shape)
                broadcast_beta = K.reshape(self.beta, broadcast_shape)
                broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
                x_normed_running = K.batch_normalization(
                    x, broadcast_running_mean, broadcast_running_std,
                    broadcast_beta, broadcast_gamma,
                    epsilon=self.epsilon)

            # pick the normalized form of x corresponding to the training phase
            x_normed = K.in_train_phase(x_normed, x_normed_running)

    elif self.mode == 1:
        # sample-wise normalization
        m = K.mean(x, axis=-1, keepdims=True)
        std = K.sqrt(K.var(x, axis=-1, keepdims=True) + self.epsilon)
        x_normed = (x - m) / (std + self.epsilon)
        x_normed = self.gamma * x_normed + self.beta
    return x_normed
def compute_loss(self, input, output, input_mask=None, output_mask=None):
    """Penalise values of ``input`` outside ``[self.low, self.high]``.

    The penalty is the mean absolute distance to the violated bound,
    multiplied by ``self.weight``; it is returned in the training phase and
    replaced by ``0`` otherwise. ``output`` and both masks are not used.
    """
    below = K.switch(input < self.low, K.abs(input - self.low), 0)
    above = K.switch(input > self.high, K.abs(input - self.high), 0)
    penalty = self.weight * K.mean(above + below)
    return K.in_train_phase(penalty, 0)
def group_ksparse(x, groups, k, axis_group, axis_sparse, norm=2, alpha=1,
                  epsilon=None):
    """Multiply ``x`` by a 0/1 mask selected through top-k group norms.

    ``x`` is permuted so that the remaining axes come first, then
    ``axis_sparse``, then ``axis_group``; group norms are computed with
    ``group_norms`` and ``tf.nn.top_k`` picks the entries whose mask value
    is set to one. The number of selected entries is ``k`` in the training
    phase and ``alpha * k`` otherwise, capped at the product of the
    ``axis_sparse`` dimensions.

    Args:
        x: input tensor.
        groups: number of groups passed to ``group_norms``.
        k: number of entries kept in the training phase.
        axis_group: int, list or tuple of axes forming the group dimension.
        axis_sparse: int, list or tuple of axes forming the sparse dimension.
        norm: forwarded to ``group_norms``.
        alpha: multiplier applied to ``k`` outside the training phase.
        epsilon: forwarded to ``group_norms``; defaults to ``K.epsilon()``.

    Returns:
        ``mask * x``.

    Raises:
        AssertionError: if ``axis_group`` and ``axis_sparse`` share an axis.
    """
    def _as_axes(axes):
        # A lone int or a list becomes a tuple; anything else is kept as is.
        if isinstance(axes, int):
            return (axes, )
        if isinstance(axes, list):
            return tuple(axes)
        return axes

    axis_group = _as_axes(axis_group)
    axis_sparse = _as_axes(axis_sparse)
    assert not (set(axis_group) & set(axis_sparse))
    if epsilon is None:
        epsilon = K.epsilon()

    axis_complement = tuple(
        set(range(K.ndim(x))) - set(axis_group) - set(axis_sparse))
    order = axis_complement + axis_sparse + axis_group
    group_size = K.prod([K.shape(x)[j] for j in axis_group])
    sparse_size = K.prod([K.shape(x)[j] for j in axis_sparse])
    top_count = K.minimum(K.in_train_phase(k, alpha * k), sparse_size)

    # Bring x to (rest, sparse, group) and compute one norm per group.
    permuted = K.permute_dimensions(x, order)
    flat = K.reshape(permuted, (-1, sparse_size, group_size))
    norms = group_norms(
        inputs=flat, groups=groups, axis=-1, norm=norm, epsilon=epsilon)
    norms = K.permute_dimensions(norms, (0, 2, 1))
    norms = K.reshape(norms, (-1, sparse_size))

    # Scatter ones at the (row, top-k index) positions.
    _, indices = tf.nn.top_k(norms, top_count)
    row_ids = (K.arange(K.shape(norms)[0])[:, None] *
               K.ones((1, top_count), dtype='int32'))
    scatter_indices = K.concatenate(
        [row_ids[:, :, None], indices[:, :, None]])
    scatter_updates = K.ones((K.shape(norms)[0], top_count))
    mask = K.cast(
        tf.scatter_nd(scatter_indices, scatter_updates, K.shape(norms)),
        K.floatx())

    # Expand the per-group mask back to the layout of the permuted input.
    mask = K.reshape(mask, (-1, groups, sparse_size))
    mask = K.permute_dimensions(mask, (0, 2, 1))
    mask = mask[:, :, :, None] * K.ones(
        (1, 1, 1, floor_div(group_size, groups)))
    mask = K.reshape(mask, K.shape(permuted))

    # Undo the initial permutation.
    mask = K.permute_dimensions(mask, tuple(np.argsort(order)))
    return mask * x
def call(self, x, mask=None):
    """Apply dropout at rate ``self.p`` in the training phase.

    When ``self.p`` is not strictly between 0 and 1 the input is returned
    unchanged. ``mask`` is accepted but not used.
    """
    if not 0. < self.p < 1.:
        return x
    dropped = K.dropout(x, level=self.p)
    return K.in_train_phase(dropped, x)
def call(self, x, training=None):
    """Multiply ``x`` by ``rand_rotate_matrix_symbol()`` in the training phase.

    Outside the training phase ``x`` is returned unchanged; ``training`` is
    forwarded to ``K.in_train_phase``.
    """
    rotated = K.dot(x, rand_rotate_matrix_symbol())
    return K.in_train_phase(rotated, x, training=training)
def call(self, inputs, mask=None):
    """Return ``self._drop_path(inputs)`` in the training phase.

    Outside the training phase ``self._ave(inputs)`` is returned instead.
    ``mask`` is accepted but not used.
    """
    train_branch = self._drop_path(inputs)
    test_branch = self._ave(inputs)
    return K.in_train_phase(train_branch, test_branch)
#################### # Initialise model # #################### # Restrict GPU memory usage if config.set_gpu is not None: conf = tf.ConfigProto() conf.gpu_options.allow_growth = True conf.gpu_options.visible_device_list = str(config.set_gpu) sess = tf.Session(config=conf) set_session(sess) del config.set_gpu eps_train_var = K.variable(config.train_epsilon) eps = K.in_train_phase(K.stop_gradient(eps_train_var), K.constant(config.eval_epsilon)) k_train_var = K.variable(1) k = K.in_train_phase(K.stop_gradient(k_train_var), K.constant(config.min_k)) if config.augmentation: mean, std = x_train.mean(axis=(0, 1, 2)), x_train.std(axis=(0, 1, 2)) + 1e-6 x_train = (x_train - mean) / std x_valid = (x_valid - mean) / std print("Normalising channels with values", mean, std) else: mean, std = None, None if config.model_name == "SmallCNN": model = SmallCNN(input_shape=input_shape) elif config.model_name == "MediumCNN":
def get_yj_vars(self):
    """Return the transpose of ``self.vj`` (training phase) or ``self.vjr``."""
    selected = K.in_train_phase(self.vj, self.vjr)
    return K.transpose(selected)
def call(self,
         inputs,
         initial_state=None,
         initial_readout=None,
         ground_truth=None,
         mask=None,
         training=None):
    """Run the wrapped recurrent model over ``inputs``.

    The method assembles the list of initial states (optionally extended
    with a readout tensor and, for teacher forcing, a counter and the
    ground truth), runs ``rnn`` with ``self.step`` and strips the extra
    entries from the resulting states again.

    Args:
        inputs: input tensor, or a list whose first element is the input
            tensor followed by initial states and, depending on
            ``self.readout`` / ``self.teacher_force``, the initial readout
            and the ground truth.
        initial_state: tensor or list/tuple of tensors; only consulted when
            ``inputs`` is not a list.
        initial_readout: initial readout tensor; a zero tensor is built when
            it is ``None`` or an optional-input placeholder.
        ground_truth: required when ``self.teacher_force`` is set.
        mask: mask forwarded to ``rnn``; if a list, its first element is used.
        training: forwarded to ``K.in_train_phase`` when the layer uses the
            learning phase.

    Returns:
        The full output sequence when ``self.return_sequences`` is set,
        otherwise the last output; wrapped in a list together with the
        final states when ``self.return_states`` is set.

    Raises:
        Exception: if ``self.model`` is ``None``, or if teacher forcing is
            enabled without a ground truth.
        ValueError: if the number of initial states does not match, or if
            unrolling is requested with an undefined time dimension.
    """
    # input shape: `(samples, time (padded with zeros), input_dim)`
    # note that the .build() method of subclasses MUST define
    # self.input_spec and self.state_spec with complete input shapes.
    if type(mask) is list:
        mask = mask[0]
    if self.model is None:
        raise Exception('Empty RecurrentModel.')
    num_req_states = self.num_states
    # With readout enabled, one of the required states is the readout itself.
    if self.readout:
        num_actual_states = num_req_states - 1
    else:
        num_actual_states = num_req_states
    if type(inputs) is list:
        # Layout: [inputs, *initial_states, (initial_readout), (ground_truth)]
        inputs_list = inputs[:]
        inputs = inputs_list.pop(0)
        initial_states = inputs_list[:num_actual_states]
        if len(initial_states) > 0:
            if self._is_optional_input_placeholder(initial_states[0]):
                initial_states = self.get_initial_state(inputs)
        inputs_list = inputs_list[num_actual_states:]
        if self.readout:
            initial_readout = inputs_list.pop(0)
            if self.teacher_force:
                ground_truth = inputs_list.pop()
    else:
        if initial_state is not None:
            if not isinstance(initial_state, (list, tuple)):
                initial_states = [initial_state]
            else:
                initial_states = list(initial_state)
            if self._is_optional_input_placeholder(initial_states[0]):
                initial_states = self.get_initial_state(inputs)
        elif self.stateful:
            initial_states = self.states
        else:
            initial_states = self.get_initial_state(inputs)
    if self.readout:
        if initial_readout is None or self._is_optional_input_placeholder(
                initial_readout):
            # Build a zero readout: take one zero per sample from `inputs`
            # and tile it to the non-batch dimensions of the model output.
            output_shape = K.int_shape(_to_list((self.model.output))[0])
            output_ndim = len(output_shape)
            input_ndim = K.ndim(inputs)
            initial_readout = K.zeros_like(inputs)
            slices = [slice(None)] + [0] * (input_ndim - 1)
            initial_readout = initial_readout[slices]  # (batch_size,)
            initial_readout = K.reshape(initial_readout,
                                        (-1, ) + (1, ) * (output_ndim - 1))
            initial_readout = K.tile(initial_readout,
                                     (1, ) + tuple(output_shape[1:]))
        initial_states.append(initial_readout)
        if self.teacher_force:
            if ground_truth is None or self._is_optional_input_placeholder(
                    ground_truth):
                raise Exception(
                    'ground_truth must be provided for RecurrentModel with teacher_force=True.'
                )
            if K.backend() == 'tensorflow':
                with tf.control_dependencies(None):
                    counter = K.zeros((1, ))
            else:
                counter = K.zeros((1, ))
            counter = K.cast(counter, 'int32')
            # Counter and ground truth are inserted just before the readout,
            # giving [..., counter, ground_truth, readout].
            initial_states.insert(-1, counter)
            # NOTE(review): the next line is a bare expression with no effect.
            initial_states[-2]
            initial_states.insert(-1, ground_truth)
            num_req_states += 2
    if len(initial_states) != num_req_states:
        raise ValueError('Layer requires ' + str(num_req_states) +
                         ' states but was passed ' + str(len(initial_states)) +
                         ' initial states.')
    input_shape = K.int_shape(inputs)
    if self.unroll and input_shape[1] is None:
        raise ValueError('Cannot unroll a RNN if the '
                         'time dimension is undefined. \n'
                         '- If using a Sequential model, '
                         'specify the time dimension by passing '
                         'an `input_shape` or `batch_input_shape` '
                         'argument to your first layer. If your '
                         'first layer is an Embedding, you can '
                         'also use the `input_length` argument.\n'
                         '- If using the functional API, specify '
                         'the time dimension by passing a `shape` '
                         'or `batch_shape` argument to your Input layer.')
    # NOTE(review): `training=None` is passed here rather than the `training`
    # argument of this method - confirm this is intended.
    preprocessed_input = self.preprocess_input(inputs, training=None)
    constants = self.get_constants(inputs, training=None)
    if self.decode:
        # In decode mode the inputs travel as the first state and the
        # sequence fed to `rnn` is a zero tensor of length output_length.
        initial_states.insert(0, inputs)
        preprocessed_input = K.zeros((1, self.output_length, 1))
        input_length = self.output_length
    else:
        input_length = input_shape[1]
    if self.uses_learning_phase:
        # Build the recurrence once per learning phase and select between
        # the two results below. `updates` from the first pass is
        # overwritten by the second pass.
        with learning_phase_scope(0):
            last_output_test, outputs_test, states_test, updates = rnn(
                self.step,
                preprocessed_input,
                initial_states,
                go_backwards=self.go_backwards,
                mask=mask,
                constants=constants,
                unroll=self.unroll,
                input_length=input_length)
        with learning_phase_scope(1):
            last_output_train, outputs_train, states_train, updates = rnn(
                self.step,
                preprocessed_input,
                initial_states,
                go_backwards=self.go_backwards,
                mask=mask,
                constants=constants,
                unroll=self.unroll,
                input_length=input_length)
        last_output = K.in_train_phase(
            last_output_train, last_output_test, training=training)
        outputs = K.in_train_phase(
            outputs_train, outputs_test, training=training)
        states = []
        for state_train, state_test in zip(states_train, states_test):
            states.append(
                K.in_train_phase(state_train, state_test, training=training))
    else:
        last_output, outputs, states, updates = rnn(
            self.step,
            preprocessed_input,
            initial_states,
            go_backwards=self.go_backwards,
            mask=mask,
            constants=constants,
            unroll=self.unroll,
            input_length=input_length)
    states = list(states)
    # Remove the bookkeeping entries added to the state list above.
    if self.decode:
        states.pop(0)
    if self.readout:
        states.pop()
        if self.teacher_force:
            states.pop()
            states.pop()
    if len(updates) > 0:
        self.add_update(updates)
    if self.stateful:
        updates = []
        for i in range(len(states)):
            updates.append((self.states[i], states[i]))
        self.add_update(updates, inputs)
    # Properly set learning phase
    if 0 < self.dropout + self.recurrent_dropout:
        last_output._uses_learning_phase = True
        outputs._uses_learning_phase = True
    if self.return_sequences:
        y = outputs
    else:
        y = last_output
    if self.return_states:
        return [y] + states
    else:
        return y
def smooth_min(*args):
    """Return a log-sum-exp soft minimum in training, ``K.minimum`` otherwise.

    The ``K.log(2.0)`` offset cancels the ``log 2`` that the log-sum-exp of
    two equal values introduces, so two equal arguments map to themselves.
    """
    stacked = K.stack(args, axis=0)
    soft = -K.logsumexp(-stacked, axis=0) + K.log(2.0)
    hard = K.minimum(*args)
    return K.in_train_phase(soft, hard)
def smooth_max(*args):
    """Return a log-sum-exp soft maximum in training, ``K.maximum`` otherwise.

    The ``K.log(2.0)`` offset cancels the ``log 2`` that the log-sum-exp of
    two equal values introduces, so two equal arguments map to themselves.
    """
    stacked = K.stack(args, axis=0)
    soft = K.logsumexp(stacked, axis=0) - K.log(2.0)
    hard = K.maximum(*args)
    return K.in_train_phase(soft, hard)
def call(self, x, mask=None):
    """Return ``x`` in training and the one-hot Viterbi decoding otherwise.

    The decoding is obtained from ``viterbi_decode`` with ``self.U``,
    ``self.b_start``, ``self.b_end`` and ``mask``; the one-hot depth is read
    from index 2 of the layer's input spec shape.
    """
    decoded = viterbi_decode(x, self.U, self.b_start, self.b_end, mask)
    num_classes = self.input_spec[0].shape[2]
    decoded_one_hot = K.one_hot(decoded, num_classes)
    return K.in_train_phase(x, decoded_one_hot)
def call(self, inputs, training=None):
    """Add zero-mean normal noise with standard deviation ``self.stddev``.

    The same noisy branch is handed to ``K.in_train_phase`` for both
    phases, so the noise is applied whatever the phase.
    """
    def noised():
        noise = K.random_normal(
            shape=K.shape(inputs), mean=0., stddev=self.stddev)
        return inputs + noise

    return K.in_train_phase(noised, noised, training=training)
def call(self, inputs, training=None):
    """Quaternion batch normalisation of ``inputs``.

    The feature axis ``self.axis`` is split into four equal parts
    (r, i, j, k). Batch means and, when ``self.scale`` is set, the ten
    (co)variances between the parts are computed and passed to
    ``QuaternionBN``. Unless ``training`` is ``0``/``False``, moving
    averages of these statistics are registered as updates and
    ``K.in_train_phase`` selects between the batch-statistics result and
    the result computed from the moving averages.

    Raises:
        ValueError: for an unsupported combination of ``self.axis`` and
            input rank, or when both ``self.scale`` and ``self.center``
            are false.
    """
    input_shape = K.int_shape(inputs)
    ndim = len(input_shape)
    reduction_axes = list(range(ndim))
    del reduction_axes[self.axis]
    input_dim = input_shape[self.axis] // 4

    mu = K.mean(inputs, axis=reduction_axes)
    broadcast_mu_shape = [1] * len(input_shape)
    broadcast_mu_shape[self.axis] = input_shape[self.axis]
    broadcast_mu = K.reshape(mu, broadcast_mu_shape)
    input_centred = inputs - broadcast_mu if self.center else inputs
    centred_squared = input_centred ** 2

    # Number of leading full slices before the axis that is split.
    if (self.axis == 1 and ndim != 3) or ndim == 2:
        leading = 1
    elif ndim == 3:
        leading = 2
    elif self.axis == -1 and ndim == 4:
        leading = 3
    elif self.axis == -1 and ndim == 5:
        leading = 4
    else:
        raise ValueError(
            'Incorrect Batchnorm combination of axis and dimensions. axis should be either 1 or -1. '
            'axis: ' + str(self.axis) + '; ndim: ' + str(ndim) + '.'
        )

    # Open-ended bounds reproduce `:d`, `d:2d`, `2d:3d` and `3d:`.
    bounds = (None, input_dim, input_dim*2, input_dim*3, None)

    def component(tensor, q):
        index = (slice(None),) * leading + (slice(bounds[q], bounds[q + 1]),)
        return tensor[index]

    (centred_squared_r, centred_squared_i,
     centred_squared_j, centred_squared_k) = (
        component(centred_squared, q) for q in range(4))
    centred_r, centred_i, centred_j, centred_k = (
        component(input_centred, q) for q in range(4))

    if self.scale:
        # Variances
        Vrr = K.mean(centred_squared_r, axis=reduction_axes) + self.epsilon
        Vii = K.mean(centred_squared_i, axis=reduction_axes) + self.epsilon
        Vjj = K.mean(centred_squared_j, axis=reduction_axes) + self.epsilon
        Vkk = K.mean(centred_squared_k, axis=reduction_axes) + self.epsilon
        # Co-variances
        Vri = K.mean(centred_r * centred_i, axis=reduction_axes)
        Vrj = K.mean(centred_r * centred_j, axis=reduction_axes)
        Vrk = K.mean(centred_r * centred_k, axis=reduction_axes)
        Vij = K.mean(centred_i * centred_j, axis=reduction_axes)
        Vik = K.mean(centred_i * centred_k, axis=reduction_axes)
        Vjk = K.mean(centred_j * centred_k, axis=reduction_axes)
    elif self.center:
        Vrr = Vri = Vrj = Vrk = None
        Vii = Vij = Vik = None
        Vjj = Vjk = None
        Vkk = None
    else:
        raise ValueError('Error. Both scale and center in batchnorm are set to False.')

    input_bn = QuaternionBN(
        input_centred,
        Vrr, Vri, Vrj, Vrk, Vii,
        Vij, Vik, Vjj, Vjk, Vkk,
        self.beta,
        self.gamma_rr, self.gamma_ri, self.gamma_rj, self.gamma_rk,
        self.gamma_ii, self.gamma_ij, self.gamma_ik,
        self.gamma_jj, self.gamma_jk,
        self.gamma_kk,
        self.scale, self.center, axis=self.axis)

    if training in {0, False}:
        return input_bn

    update_list = []
    if self.center:
        update_list.append(
            K.moving_average_update(self.moving_mean, mu, self.momentum))
    if self.scale:
        tracked = (
            (self.moving_Vrr, Vrr),
            (self.moving_Vri, Vri),
            (self.moving_Vrk, Vrk),
            (self.moving_Vrj, Vrj),
            (self.moving_Vii, Vii),
            (self.moving_Vij, Vij),
            (self.moving_Vik, Vik),
            (self.moving_Vjj, Vjj),
            (self.moving_Vjk, Vjk),
            (self.moving_Vkk, Vkk),
        )
        for moving, batch_value in tracked:
            update_list.append(
                K.moving_average_update(moving, batch_value, self.momentum))
    self.add_update(update_list, inputs)

    def normalize_inference():
        # Same normalisation, driven by the moving statistics.
        if self.center:
            inference_centred = inputs - K.reshape(self.moving_mean,
                                                   broadcast_mu_shape)
        else:
            inference_centred = inputs
        return QuaternionBN(
            inference_centred,
            self.moving_Vrr, self.moving_Vri, self.moving_Vrj,
            self.moving_Vrk, self.moving_Vii, self.moving_Vij,
            self.moving_Vik, self.moving_Vjj, self.moving_Vjk,
            self.moving_Vkk,
            self.beta,
            self.gamma_rr, self.gamma_ri, self.gamma_rj, self.gamma_rk,
            self.gamma_ii, self.gamma_ij, self.gamma_ik,
            self.gamma_jj, self.gamma_jk,
            self.gamma_kk,
            self.scale, self.center, axis=self.axis)

    # Pick the normalized form corresponding to the training phase.
    return K.in_train_phase(input_bn, normalize_inference, training=training)
def call(self, inputs, training=None):
    """Complex batch normalisation of ``inputs``.

    The feature axis ``self.axis`` is split into a real and an imaginary
    half. Batch means and, when ``self.scale`` is set, the variances of
    both halves and their covariance are computed and passed to
    ``ComplexBN``. Unless ``training`` is ``0``/``False``, moving averages
    of these statistics are registered as updates and ``K.in_train_phase``
    selects between the batch-statistics result and the result computed
    from the moving averages.

    Raises:
        ValueError: for an unsupported combination of ``self.axis`` and
            input rank, or when both ``self.scale`` and ``self.center``
            are false.
    """
    input_shape = K.int_shape(inputs)
    ndim = len(input_shape)
    reduction_axes = list(range(ndim))
    del reduction_axes[self.axis]
    input_dim = input_shape[self.axis] // 2

    mu = K.mean(inputs, axis=reduction_axes)
    broadcast_mu_shape = [1] * len(input_shape)
    broadcast_mu_shape[self.axis] = input_shape[self.axis]
    broadcast_mu = K.reshape(mu, broadcast_mu_shape)
    input_centred = inputs - broadcast_mu if self.center else inputs
    centred_squared = input_centred ** 2

    # Number of leading full slices before the axis that is split.
    if (self.axis == 1 and ndim != 3) or ndim == 2:
        leading = 1
    elif ndim == 3:
        leading = 2
    elif self.axis == -1 and ndim == 4:
        leading = 3
    elif self.axis == -1 and ndim == 5:
        leading = 4
    else:
        raise ValueError(
            'Incorrect Batchnorm combination of axis and dimensions. axis should be either 1 or -1. '
            'axis: ' + str(self.axis) + '; ndim: ' + str(ndim) + '.'
        )

    full = (slice(None),) * leading
    real_part = full + (slice(None, input_dim),)
    imag_part = full + (slice(input_dim, None),)

    centred_squared_real = centred_squared[real_part]
    centred_squared_imag = centred_squared[imag_part]
    centred_real = input_centred[real_part]
    centred_imag = input_centred[imag_part]

    if self.scale:
        Vrr = K.mean(centred_squared_real, axis=reduction_axes) + self.epsilon
        Vii = K.mean(centred_squared_imag, axis=reduction_axes) + self.epsilon
        # Vri contains the real and imaginary covariance for each feature map.
        Vri = K.mean(centred_real * centred_imag, axis=reduction_axes)
    elif self.center:
        Vrr = Vii = Vri = None
    else:
        raise ValueError('Error. Both scale and center in batchnorm are set to False.')

    input_bn = ComplexBN(
        input_centred, Vrr, Vii, Vri,
        self.beta, self.gamma_rr, self.gamma_ri,
        self.gamma_ii, self.scale, self.center,
        axis=self.axis
    )

    if training in {0, False}:
        return input_bn

    update_list = []
    if self.center:
        update_list.append(
            K.moving_average_update(self.moving_mean, mu, self.momentum))
    if self.scale:
        tracked = (
            (self.moving_Vrr, Vrr),
            (self.moving_Vii, Vii),
            (self.moving_Vri, Vri),
        )
        for moving, batch_value in tracked:
            update_list.append(
                K.moving_average_update(moving, batch_value, self.momentum))
    self.add_update(update_list, inputs)

    def normalize_inference():
        # Same normalisation, driven by the moving statistics.
        if self.center:
            inference_centred = inputs - K.reshape(self.moving_mean,
                                                   broadcast_mu_shape)
        else:
            inference_centred = inputs
        return ComplexBN(
            inference_centred, self.moving_Vrr, self.moving_Vii,
            self.moving_Vri, self.beta, self.gamma_rr, self.gamma_ri,
            self.gamma_ii, self.scale, self.center, axis=self.axis
        )

    # Pick the normalized form corresponding to the training phase.
    return K.in_train_phase(input_bn, normalize_inference, training=training)
def assister_loss(y_true, y_pred):
    """Categorical cross-entropy that is only active in the training phase.

    Args:
        y_true: target tensor passed to ``KL.categorical_crossentropy``.
        y_pred: prediction tensor passed to ``KL.categorical_crossentropy``.

    Returns:
        The cross-entropy in the training phase and a zero tensor of the
        same shape otherwise.
    """
    # Build the cross-entropy once and reuse it for the zero tensor instead
    # of constructing the same expression twice.
    loss = KL.categorical_crossentropy(y_true, y_pred)
    return KB.in_train_phase(loss, KB.zeros_like(loss))
def call(self, inputs):
    """Add uniform noise drawn between ``self.minval`` and ``self.maxval``.

    The same noisy branch is handed to ``K.in_train_phase`` for both
    phases, so the noise is applied whatever the phase.
    """
    def noised():
        noise = K.random_uniform(shape=K.shape(inputs),
                                 minval=self.minval,
                                 maxval=self.maxval)
        return inputs + noise

    return K.in_train_phase(noised, noised)