def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    lr_t = self.lr / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        u_t = K.maximum(self.beta_2 * u, K.abs(g))
        p_t = p - self.get_param_learning_rate_t(p, t, lr_t) * m_t / (u_t + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(u, u_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    shapes = [K.get_variable_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    self.updates = []

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
        # update accumulator
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))

        # use the new accumulator and the *old* delta_accumulator
        update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

        new_p = p - get_learning_rate(p, self.lr) * update
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))

        # update delta_accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
        self.updates.append(K.update(d_a, new_d_a))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    lr_t = self.lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - self.get_param_learning_rate_t(p, t, lr_t) * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
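# A minimal sanity check (plain NumPy, separate from the optimizer above) of
# the bias-corrected step size lr_t computed in get_updates: the correction
# factor sqrt(1 - beta_2^t) / (1 - beta_1^t) approaches 1 as t grows, so lr_t
# converges to the raw learning rate. The hyper-parameter values below are
# the common Adam defaults, used here purely for illustration.
import numpy as np

lr, beta_1, beta_2 = 0.001, 0.9, 0.999
for t in (1, 10, 1000):
    lr_t = lr * np.sqrt(1. - beta_2 ** t) / (1. - beta_1 ** t)
    print(t, lr_t)  # lr_t -> lr as t -> infinity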
def get_output(self, train=False):
    x = self.get_input(train)
    x_shape = K.shape(x)
    # pad the input with n-1 planes of zeros along a new last axis
    stacked = K.concatenate([K.reshape(x, (x_shape[0], x_shape[1], 1)),
                             K.zeros((x_shape[0], x_shape[1], self.n - 1))],
                            axis=2)
    return stacked.dimshuffle((0, 2, 1))
def reset_states(self):
    assert self.stateful, 'Layer must be stateful.'
    input_shape = self.input_shape
    if not input_shape[0]:
        raise Exception('If a RNN is stateful, a complete ' +
                        'input_shape must be provided ' +
                        '(including batch size).')
    if self.return_sequences:
        out_row, out_col, out_filter = self.output_shape[2:]
    else:
        out_row, out_col, out_filter = self.output_shape[1:]
    if hasattr(self, 'states'):
        K.set_value(self.states[0],
                    np.zeros((input_shape[0], out_row, out_col, out_filter)))
        K.set_value(self.states[1],
                    np.zeros((input_shape[0], out_row, out_col, out_filter)))
    else:
        self.states = [K.zeros((input_shape[0], out_row, out_col, out_filter)),
                       K.zeros((input_shape[0], out_row, out_col, out_filter))]
def build(self):
    f_init = self.get_function('init')
    f_inner_init = self.get_function('inner_init')
    f_forget_bias_init = self.get_function('forget_bias_init')

    # numpy matrices
    W_i = f_init((self.input_dim, self.output_dim), name=self.name + '_W_i').get_value()
    U_i = f_inner_init((self.output_dim, self.output_dim), name=self.name + '_U_i').get_value()
    b_i = K.zeros((self.output_dim,), name=self.name + '_b_i').get_value()

    W_f = f_init((self.input_dim, self.output_dim), name=self.name + '_W_f').get_value()
    U_f = f_inner_init((self.output_dim, self.output_dim), name=self.name + '_U_f').get_value()
    b_f = f_forget_bias_init((self.output_dim,), name=self.name + '_b_f').get_value()

    W_c = f_init((self.input_dim, self.output_dim), name=self.name + '_W_c').get_value()
    U_c = f_inner_init((self.output_dim, self.output_dim), name=self.name + '_U_c').get_value()
    b_c = K.zeros((self.output_dim,), name=self.name + '_b_c').get_value()

    W_o = f_init((self.input_dim, self.output_dim), name=self.name + '_W_o').get_value()
    U_o = f_inner_init((self.output_dim, self.output_dim), name=self.name + '_U_o').get_value()
    b_o = K.zeros((self.output_dim,), name=self.name + '_b_o').get_value()

    # theano variables holding the concatenated gate weights
    self.W = theano.shared(numpy.concatenate([W_i, W_f, W_c, W_o], axis=1),
                           name=self.name + '_W', strict=False)
    self.U = theano.shared(numpy.concatenate([U_i, U_f, U_c, U_o], axis=1),
                           name=self.name + '_U', strict=False)
    self.b = theano.shared(numpy.concatenate([b_i, b_f, b_c, b_o]),
                           name=self.name + '_b', strict=False)

    self.params = [self.W, self.U, self.b]
def reset_states(self):
    assert self.stateful, 'Layer must be stateful.'
    input_shape = self.input_shape
    if not input_shape[0]:
        raise Exception('If a RNN is stateful, a complete ' +
                        'input_shape must be provided ' +
                        '(including batch size).')
    (batch_size, tsteps, xsize) = input_shape
    if hasattr(self, 'states'):
        K.set_value(self.states[0], np.zeros((batch_size, self.output_dim)))
        K.set_value(self.states[1], np.zeros((1,), dtype='int32'))
        K.set_value(self.states[2], np.zeros((batch_size, self.output_dim)))
    else:
        self.states = [K.zeros((batch_size, self.output_dim), name='stateA'),
                       K.zeros((1,), name='stateB', dtype='int32'),
                       K.zeros((batch_size, self.output_dim), name='stateC')]
def build(self):
    self.W1 = self.init((self.input_dim, self.n_classes),
                        name='{}_W1'.format(self.name))
    self.b1 = K.zeros((self.n_classes,), name='{}_b1'.format(self.name))
    self.W2 = self.init((self.n_classes, self.input_dim, self.n_outputs_per_class),
                        name='{}_W2'.format(self.name))
    self.b2 = K.zeros((self.n_classes, self.n_outputs_per_class),
                      name='{}_b2'.format(self.name))
    self.trainable_weights = [self.W1, self.b1, self.W2, self.b2]
def build(self, input_shape):
    self.input_spec = [InputSpec(shape=input_shape)]
    nb_input_channel = input_shape[2]
    W_shape = (self.nb_filter, nb_input_channel, self.nb_row, self.nb_col)
    U_shape = (self.nb_filter, self.nb_filter, self.nb_row, self.nb_col)
    C_shape = (self.nb_filter, input_shape[3], input_shape[4])
    b_shape = (self.nb_filter,)

    self.W_i = self.init(W_shape, name="{}_W_i".format(self.name))
    self.U_i = self.inner_init(U_shape, name="{}_U_i".format(self.name))
    self.C_i = self.inner_init(C_shape, name="{}_C_i".format(self.name))
    self.b_i = K.zeros(b_shape, name="{}_b_i".format(self.name))

    self.W_f = self.init(W_shape, name="{}_W_f".format(self.name))
    self.U_f = self.inner_init(U_shape, name="{}_U_f".format(self.name))
    self.C_f = self.inner_init(C_shape, name="{}_C_f".format(self.name))
    self.b_f = self.forget_bias_init(b_shape, name="{}_b_f".format(self.name))

    self.W_c = self.init(W_shape, name="{}_W_c".format(self.name))
    self.U_c = self.inner_init(U_shape, name="{}_U_c".format(self.name))
    self.b_c = K.zeros(b_shape, name="{}_b_c".format(self.name))

    self.W_o = self.init(W_shape, name="{}_W_o".format(self.name))
    self.U_o = self.inner_init(U_shape, name="{}_U_o".format(self.name))
    self.C_o = self.inner_init(C_shape, name="{}_C_o".format(self.name))
    self.b_o = K.zeros(b_shape, name="{}_b_o".format(self.name))

    self.trainable_weights = [self.W_i, self.U_i, self.b_i,
                              self.W_c, self.U_c, self.b_c,
                              self.W_f, self.U_f, self.b_f,
                              self.W_o, self.U_o, self.b_o,
                              self.C_i, self.C_f, self.C_o]
def build(self):
    n_features = self.input_shape[1]
    self.W1 = self.init((n_features, self.nb_hsm_classes))
    self.b1 = K.zeros((self.nb_hsm_classes,))
    self.W2 = self.init((self.nb_hsm_classes, n_features, self.nb_outputs_per_class))
    self.b2 = K.zeros((self.nb_hsm_classes, self.nb_outputs_per_class))
    self.trainable_weights = [self.W1, self.b1, self.W2, self.b2]

    self.regularizers = []
    if self.W1_regularizer:
        self.W1_regularizer.set_param(self.W1)
        self.regularizers.append(self.W1_regularizer)
    if self.b1_regularizer:
        self.b1_regularizer.set_param(self.b1)
        self.regularizers.append(self.b1_regularizer)
    if self.W2_regularizer:
        self.W2_regularizer.set_param(self.W2)
        self.regularizers.append(self.W2_regularizer)
    if self.b2_regularizer:
        self.b2_regularizer.set_param(self.b2)
        self.regularizers.append(self.b2_regularizer)
def build(self, input_shape):
    self.input_spec = [InputSpec(shape=input_shape)]
    if self.stateful:
        self.reset_states()
    else:
        # initial states: all-zero tensor of shape (output_dim)
        self.states = [None]
    input_dim = input_shape[2]
    self.input_dim = input_dim

    self.V = self.init((self.output_dim, input_dim - self.control_dim),
                       name='{}_V'.format(self.name))
    self.W = self.init((input_dim, self.output_dim),
                       name='{}_W'.format(self.name))
    self.U = self.inner_init((self.output_dim, self.output_dim),
                             name='{}_U'.format(self.name))
    self.b = K.zeros((self.output_dim,), name='{}_b'.format(self.name))
    self.ext_b = K.zeros((input_dim - self.control_dim,),
                         name='{}_ext_b'.format(self.name))

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W)
        self.regularizers.append(self.W_regularizer)
    if self.U_regularizer:
        self.U_regularizer.set_param(self.U)
        self.regularizers.append(self.U_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(self.b)
        self.regularizers.append(self.b_regularizer)

    self.trainable_weights = [self.W, self.U, self.b, self.V, self.ext_b]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self, input_shape):
    assert len(input_shape) >= 3
    self.input_spec = [InputSpec(shape=input_shape)]

    if not self.layer.built:
        self.layer.build(input_shape)
        self.layer.built = True

    super(AttentionLSTMWrapper, self).build()

    if hasattr(self.attention_vec, '_keras_shape'):
        attention_dim = self.attention_vec._keras_shape[1]
    else:
        raise Exception('Layer could not be built: no information about expected input shape.')

    self.U_a = self.layer.inner_init((self.layer.output_dim, self.layer.output_dim), name='{}_U_a'.format(self.name))
    self.b_a = K.zeros((self.layer.output_dim,), name='{}_b_a'.format(self.name))
    self.U_m = self.layer.inner_init((attention_dim, self.layer.output_dim), name='{}_U_m'.format(self.name))
    self.b_m = K.zeros((self.layer.output_dim,), name='{}_b_m'.format(self.name))

    if self.single_attention_param:
        self.U_s = self.layer.inner_init((self.layer.output_dim, 1), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((1,), name='{}_b_s'.format(self.name))
    else:
        self.U_s = self.layer.inner_init((self.layer.output_dim, self.layer.output_dim), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((self.layer.output_dim,), name='{}_b_s'.format(self.name))

    self.trainable_weights = [self.U_a, self.U_m, self.U_s, self.b_a, self.b_m, self.b_s]
def build(self, input_shape):
    super(AttentionLSTM, self).build(input_shape)

    if hasattr(self.attention_vec, '_keras_shape'):
        attention_dim = self.attention_vec._keras_shape[1]
    else:
        raise Exception('Layer could not be built: no information about expected input shape.')

    self.U_a = self.inner_init((self.output_dim, self.output_dim), name='{}_U_a'.format(self.name))
    self.b_a = K.zeros((self.output_dim,), name='{}_b_a'.format(self.name))

    # U_m is the weight corresponding to the image feature
    self.U_m = self.inner_init((attention_dim, self.output_dim), name='{}_U_m'.format(self.name))
    # b_m is the bias for the MLP in the calculation of tau
    self.b_m = K.zeros((self.output_dim,), name='{}_b_m'.format(self.name))

    if self.single_attention_param:
        self.U_s = self.inner_init((self.output_dim, 1), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((1,), name='{}_b_s'.format(self.name))
    else:
        self.U_s = self.inner_init((self.output_dim, self.output_dim), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((self.output_dim,), name='{}_b_s'.format(self.name))

    self.trainable_weights += [self.U_a, self.U_m, self.U_s, self.b_a, self.b_m, self.b_s]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self, input_shape):
    self.input_spec = [InputSpec(shape=shape) for shape in input_shape]
    input_dim = self.input_spec[0].shape[-1]

    self.W1 = self.init((input_dim, self.n_classes), name='{}_W1'.format(self.name))
    self.b1 = K.zeros((self.n_classes,), name='{}_b1'.format(self.name))
    self.W2 = self.init((self.n_classes, input_dim, self.n_outputs_per_class), name='{}_W2'.format(self.name))
    self.b2 = K.zeros((self.n_classes, self.n_outputs_per_class), name='{}_b2'.format(self.name))
    self.trainable_weights = [self.W1, self.b1, self.W2, self.b2]
def build(self):
    super(AttentionDecoder, self).build()
    dim = self.input_dim
    hdim = self.hidden_dim
    self.input_length = self.input_shape[-2]
    if not self.input_length:
        raise Exception('AttentionDecoder requires input_length.')

    self.W_h = self.init((dim, hdim))
    self.b_h = K.zeros((hdim,))
    self.W_a = self.init((hdim, 1))
    self.b_a = K.zeros((1,))
    self.trainable_weights += [self.W_a, self.b_a, self.W_h, self.b_h]
def build(self, input_shape): """Initializes trainable weights.""" x_input_shape, xp_input_shape = input_shape # Unpack n_feat = xp_input_shape[1] self.lstm = LSTMStep(n_feat) self.q_init = K.zeros([self.n_test, n_feat]) self.r_init = K.zeros([self.n_test, n_feat]) self.states_init = self.lstm.get_initial_states([self.n_test, n_feat]) self.trainable_weights = [self.q_init, self.r_init]
def build(self):
    input_dim = self.input_shape[1]
    self.W_mean = self.init((input_dim, self.output_dim))
    self.b_mean = K.zeros((self.output_dim,))
    self.W_logsigma = self.init((input_dim, self.output_dim))
    self.b_logsigma = K.zeros((self.output_dim,))
    self.trainable_weights = [self.W_mean, self.b_mean,
                              self.W_logsigma, self.b_logsigma]

    self.regularizers = []
    reg = self.get_variational_regularization(self.get_input())
    self.regularizers.append(reg)
def build(self):
    input_shape = self.input_shape
    input_dim = input_shape[2]  # = |x|
    # works only for stateful? (todo: try)
    self.input_dim = input_dim
    self.input = K.placeholder(input_shape)

    # output dim = |c| = |h| = |output|
    # input dim = |x|
    if self.stateful:
        self.reset_states()
    else:
        # initial states: 3 all-zero tensors of shape (output_dim)
        self.states = [None, None, None]

    input_dim = self.input_dim
    output_dim = self.output_dim

    # build the block connectivity mask and the per-unit update period,
    # one block of n units per entry in self.periods
    n = self.output_dim // len(self.periods)
    mask = np.zeros((self.output_dim, self.output_dim))
    period = np.zeros((self.output_dim,), 'i')
    for i, T in enumerate(self.periods):
        mask[i * n:(i + 1) * n, i * n:] = 1
        period[i * n:(i + 1) * n] = T

    self.mask = K.zeros((self.output_dim, self.output_dim))
    self.period = K.zeros((self.output_dim,), 'i')
    K.set_value(self.mask, mask)
    K.set_value(self.period, period)
    # todo: mask & period are shared
    # n.b.: K.zeros is shared by default (?)

    self.hh = self.init((self.output_dim, self.output_dim))
    self.xh = self.init((self.input_dim, self.output_dim))
    self.b = K.zeros((self.output_dim,), name='b')
    self.trainable_weights = [self.hh, self.xh, self.b]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def get_initial_states(self, X):
    batch_size = X.shape[0]
    init_r = K.zeros((self.m_length,)).dimshuffle('x', 0).repeat(batch_size, axis=0)
    init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle('x', 0, 1).repeat(batch_size, axis=0)
    init_S = K.zeros((self.n_slots,)).dimshuffle('x', 0).repeat(batch_size, axis=0)
    init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
    itime = K.zeros((1,), dtype=np.int32)
    if self.inner_rnn == 'lstm':
        init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        return [init_r, init_V, init_S, itime, init_h, init_c]
    return [init_r, init_V, init_S, itime, init_h]
def build(self):
    input_dim = self.input_shape[1]
    self.W_mean = self.init((input_dim, self.output_dim))
    self.b_mean = K.zeros((self.output_dim,))
    self.W_logsigma = self.init((input_dim, self.output_dim))
    self.b_logsigma = K.zeros((self.output_dim,))
    self.params = [self.W_mean, self.b_mean, self.W_logsigma, self.b_logsigma]

    self.regularizers = []
    mean, logsigma = self.get_mean_logsigma()
    self.regularizers.append(GaussianKL(mean, logsigma))
def reset_states(self):
    assert self.stateful, 'Layer must be stateful.'
    input_shape = self.input_spec[0].shape
    if not input_shape[0]:
        raise Exception('If a RNN is stateful, a complete '
                        'input_shape must be provided (including batch size).')
    if hasattr(self, 'states'):
        K.set_value(self.states[0],
                    np.zeros((input_shape[0], self.output_dim)))
        K.set_value(self.states[1],
                    np.zeros((input_shape[1], input_shape[0], self.output_dim)))
    else:
        self.states = [K.zeros((input_shape[0], self.output_dim)),
                       K.zeros((input_shape[1], input_shape[0], self.output_dim))]
def build(self, input_shape):
    # Input shape :: (samples, channels, height, width)
    self.input_spec = [InputSpec(shape=input_shape)]

    if self.direction == 'Down':
        dims = self.input_spec[0].shape
        self.shuffeled_dims = (dims[0], dims[3], dims[1], dims[2])
    elif self.direction == 'Right':
        dims = self.input_spec[0].shape
        self.shuffeled_dims = (dims[0], dims[2], dims[1], dims[3])
    else:
        raise Exception('ERROR: Unknown direction')

    input_dim = self.shuffeled_dims[2]
    self.input_dim = input_dim

    self.Shape = (4 * self.nb_filter, input_dim, 1, 1)
    self.Shape1 = (4 * self.nb_filter, self.nb_filter, 2, 1)
    self.Shape2 = (self.nb_filter, self.shuffeled_dims[3])

    self.W_iof = self.init(self.Shape)
    self.U_iof = self.init(self.Shape1)
    self.b_iof = K.zeros((4 * self.nb_filter,))
    self.init_h = K.zeros(self.Shape2)
    self.init_c = K.zeros(self.Shape2)

    if self.stateful:
        self.reset_states()
    else:
        self.states = [None, None]

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W_iof)
        self.regularizers.append(self.W_regularizer)
    if self.U_regularizer:
        self.U_regularizer.set_param(self.U_iof)
        self.regularizers.append(self.U_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(self.b_iof)
        self.regularizers.append(self.b_regularizer)

    self.trainable_weights = [self.W_iof, self.U_iof, self.b_iof,
                              self.init_h, self.init_c]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self):
    input_shape = self.input_shape
    input_dim = input_shape[2]  # = |x|
    # works only for stateful? (todo: try)
    self.input_dim = input_dim
    self.input = K.placeholder(input_shape)

    # output dim = |c| = |h| = |output|
    # input dim = |x|
    if self.stateful:
        self.reset_states()
    else:
        # initial states: 2 all-zero tensors of shape (output_dim)
        self.states = [None, None]

    input_dim = self.input_dim
    output_dim = self.output_dim

    self.W_fx = self.init((input_dim, output_dim))
    self.W_fh = self.inner_init((output_dim, output_dim))
    self.b_f = self.forget_bias_init((output_dim,))

    self.W_ix = self.init((input_dim, output_dim))
    self.W_ih = self.inner_init((output_dim, output_dim))
    self.b_i = K.zeros((output_dim,))

    self.W_cx = self.init((input_dim, output_dim))
    self.W_ch = self.inner_init((output_dim, output_dim))
    self.b_c = K.zeros((output_dim,))

    self.W_ox = self.init((input_dim, output_dim))
    self.W_oh = self.inner_init((output_dim, output_dim))
    self.b_o = K.zeros((output_dim,))

    self.trainable_weights = [self.W_fx, self.W_fh, self.b_f,
                              self.W_ix, self.W_ih, self.b_i,
                              self.W_cx, self.W_ch, self.b_c,
                              self.W_ox, self.W_oh, self.b_o]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self, input_shape):
    self.W = self._conv_layer.W
    if self.dim_ordering == 'th':
        self.W_shape = (self.nb_out_channels, self.nb_filter,
                        self.nb_row, self.nb_col)
    elif self.dim_ordering == 'tf':
        self.W_shape = (self.nb_row, self.nb_col,
                        self.nb_out_channels, self.nb_filter)
    else:
        raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

    self.b = K.zeros((self.nb_out_channels,))
    self.params = [self.b]

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W)
        self.regularizers.append(self.W_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(self.b)
        self.regularizers.append(self.b_regularizer)
    if self.activity_regularizer:
        self.activity_regularizer.set_layer(self)
        self.regularizers.append(self.activity_regularizer)

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        if p.name in self.lr_mult:
            multiplied_lr = lr * self.lr_mult[p.name]
        else:
            multiplied_lr = lr

        v = self.momentum * m - multiplied_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - multiplied_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
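# A small, runnable sketch (plain Python) of the name-based lookup used in the
# SGD variant above: a parameter only receives a scaled step when its `.name`
# appears as a key in `lr_mult`. `DummyParam` is a stand-in for illustration,
# not a class from the source.
class DummyParam(object):
    def __init__(self, name):
        self.name = name

lr, lr_mult = 0.01, {'dense_1_W': 0.1}
for p in (DummyParam('dense_1_W'), DummyParam('conv_1_W')):
    multiplied_lr = lr * lr_mult[p.name] if p.name in lr_mult else lr
    print(p.name, multiplied_lr)  # dense_1_W gets 0.001, conv_1_W keeps 0.01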
def get_initial_states(self, X):
    # build an all-zero tensor of shape (samples, hidden_dim)
    initial_state = K.zeros_like(X)  # (samples, input_dim)
    reducer = K.zeros((self.input_dim, self.hidden_dim))
    initial_state = K.dot(initial_state, reducer)  # (samples, hidden_dim)
    initial_states = [initial_state for _ in range(len(self.states))]
    return initial_states
def __init__(self, input_dim, output_dim, init='glorot_uniform',
             activation='linear', weights=None,
             W_regularizer=None, b_regularizer=None, activity_regularizer=None,
             W_constraint=None, b_constraint=None):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.init = initializations.get(init)
    self.activation = activations.get(activation)

    # Regularizers and constraints are currently disabled:
    # self.W_regularizer = regularizers.get(W_regularizer)
    # self.b_regularizer = regularizers.get(b_regularizer)
    # self.activity_regularizer = regularizers.get(activity_regularizer)
    # self.W_constraint = constraints.get(W_constraint)
    # self.b_constraint = constraints.get(b_constraint)
    # self.constraints = [self.W_constraint, self.b_constraint]
    # self.initial_weights = weights
    # super(TimeDistributedDense, self).__init__(**kwargs)

    # weights are created here instead of in build()
    self.W = self.init((self.input_dim, self.output_dim))
    self.b = K.zeros((self.output_dim,))
    self.params = [self.W, self.b]
def build(self, input_shape):
    input_dim = input_shape[2]
    _, output_length, nb_filter = self.get_output_shape_for(input_shape)
    self.W_shape = (output_length, self.filter_length * input_dim, nb_filter)
    self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
    if self.bias:
        self.b = K.zeros((output_length, self.nb_filter), name='{}_b'.format(self.name))
        self.trainable_weights = [self.W, self.b]
    else:
        self.trainable_weights = [self.W]

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W)
        self.regularizers.append(self.W_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(self.b)
        self.regularizers.append(self.b_regularizer)
    if self.activity_regularizer:
        self.activity_regularizer.set_layer(self)
        self.regularizers.append(self.activity_regularizer)

    self.constraints = {}
    if self.W_constraint:
        self.constraints[self.W] = self.W_constraint
    if self.b_constraint:
        self.constraints[self.b] = self.b_constraint

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self):
    input_dim = (self.input_shape[1], self.input_shape[2])
    self.W = self.init((self.output_dim[0], input_dim[0]))
    self.b = K.zeros((self.output_dim[0], self.output_dim[1]))
    if self.has_bias:
        print("training bias unit as well")
        self.trainable_weights = [self.W, self.b]
        self.params = [self.W, self.b]
    else:
        self.trainable_weights = [self.W]
        self.params = [self.W]

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(self.W)
        self.regularizers.append(self.W_regularizer)
    # if self.b_regularizer:
    #     self.b_regularizer.set_param(self.b)
    #     self.regularizers.append(self.b_regularizer)
    if self.activity_regularizer:
        self.activity_regularizer.set_layer(self)
        self.regularizers.append(self.activity_regularizer)

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def build(self, input_shape):
    stack_size = input_shape[1]
    self.W_shape = (stack_size, self.nb_filter, self.nb_row, self.nb_col)
    w = self.init(self.W_shape)
    self.W = K.variable(K.get_value(w).reshape(self.W_shape))
    self.b = K.zeros((self.nb_filter,))
    self._trainable_weights = [self.W, self.b]
def EnrichedLSTM(sparse_size, vocab_size, max_length, method='init',
                 embedding_size=200, embeddings_dropout=0.2, hidden_size=100,
                 recurrent_dropout=0.0, output_size=2, trainable_records=True,
                 encoding_layer=None):
    # Reading the sparse version of the EHR variables
    input_record = Input(shape=(sparse_size,), name='ehr_input')

    # Embedding the EHR variables, optionally with pretrained weights
    if encoding_layer is not None:
        ae_weights = encoding_layer.get_weights()
        record_embedding_layer = Dense(units=embedding_size,
                                       weights=ae_weights,
                                       trainable=trainable_records,
                                       name='ehr_embedding')
    else:
        record_embedding_layer = Dense(units=embedding_size,
                                       trainable=trainable_records,
                                       name='ehr_embedding')
    embedded_record = record_embedding_layer(input_record)

    # Building an embedding layer for the free text in the record
    input_text = Input(shape=(max_length,), name='text_input')
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_size,
                                mask_zero=True,
                                name='text_embedding')
    text_embedding = embedding_layer(input_text)

    # Setting the activation for the final layer
    if output_size == 1:
        activation = 'sigmoid'
    else:
        activation = 'softmax'

    # Setting up the RNN
    rnn = LSTM(units=hidden_size,
               dropout=embeddings_dropout,
               recurrent_dropout=recurrent_dropout,
               return_sequences=False,
               return_state=True,
               name='rnn')

    # First option: pass the record as the initial state for the RNN
    if method == 'init':
        # Reshaping the record embedding
        reshaped_record = Reshape((1, embedding_size))(embedded_record)

        # Running the record through the RNN first (the LSTM layer defaults to
        # an all-zero initial state), and then the text
        rec_out, rec_h, rec_c = rnn(reshaped_record)
        pre_dense, _, _ = rnn(text_embedding, initial_state=[rec_h, rec_c])

    # Second option: concat the RNN output and the record before softmax
    elif method == 'post':
        rnn_output, _, _ = rnn(text_embedding)
        pre_dense = concatenate([embedded_record, rnn_output])

    # Third option: concat the word embeddings and the (repeated) record emb.
    elif method == 'word':
        repeated_record = RepeatVector(max_length)(embedded_record)
        text_embedding = concatenate([text_embedding, repeated_record], 2)
        pre_dense, _, _ = rnn(text_embedding)

    # Adding the final dense layer
    output = Dense(units=output_size, activation=activation)(pre_dense)

    # Putting everything together
    model = Model([input_record, input_text], output)
    return model
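# Usage sketch for EnrichedLSTM above. All sizes below are illustrative
# placeholders, and the optimizer/loss choices are common defaults rather
# than values taken from the source.
model = EnrichedLSTM(sparse_size=300, vocab_size=5000, max_length=40,
                     method='post', output_size=2)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()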
def fn(x, L_acc, LT_acc):
    x_ = K.zeros((self.nb_actions, self.nb_actions))
    x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)], x)
    diag = K.exp(T.diag(x_)) + K.epsilon()
    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], diag)
    return x_, x_.T
def get_updates(self, loss1, loss2, loss3, loss4, loss5, loss6, params):
    grads1 = self.get_gradients(loss1, params)
    grads2 = self.get_gradients(loss2, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    c1 = self.descent_weight1
    c2 = self.descent_weight2
    # for split and not multi specify the split weighting
    c11 = c1    # for CE dense
    c21 = c2    # for l1 dense
    c12 = 1     # for CE conv
    c22 = 4e-1  # for l2 conv

    if self.multi and not self.split:
        # calculate weighting for the loss functions given (should be default)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        flattenedList1 = [K.flatten(x) for x in grads1]
        gradients1 = K.concatenate(flattenedList1)
        flattenedList2 = [K.flatten(x) for x in grads2]
        gradients2 = K.concatenate(flattenedList2)
        grad21 = gradients2 - gradients1
        grad12 = gradients1 - gradients2
        z1 = K.sum(grad21 * gradients2)
        z2 = K.sum(grad12 * gradients1)
        n = K.sum(grad21 * grad21)
        cm1 = z1 / n
        c1 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)),
                              K.constant(True, dtype=bool)),
                      lambda: one, lambda: cm1)
        cm2 = z2 / n
        c2 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)),
                              K.constant(True, dtype=bool)),
                      lambda: zero, lambda: cm2)
        (c1, c2) = K.switch(c1 < 0, lambda: (zero, one), lambda: (c1, c2))
        (c2, c1) = K.switch(c2 < 0, lambda: (zero, one), lambda: (c2, c1))

    if self.split and self.multi:
        # calculate weighting for the loss1 given but split in conv/dense and
        # use a different loss2 (namely split loss2 into loss5 and loss6)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        grads5 = self.get_gradients(loss5, params)  # l1 loss dense
        grads6 = self.get_gradients(loss6, params)  # l2 loss conv
        flattenedList1 = [K.flatten(x) for x in grads1]
        gradients1 = K.concatenate(flattenedList1)
        flattenedList5 = [K.flatten(x) for x in grads5]
        gradients5 = K.concatenate(flattenedList5)
        flattenedList6 = [K.flatten(x) for x in grads6]
        gradients6 = K.concatenate(flattenedList6)

        grad51 = gradients5 - gradients1
        grad15 = gradients1 - gradients5
        z1 = K.sum(grad51 * gradients5)
        z2 = K.sum(grad15 * gradients1)
        n = K.sum(grad51 * grad51)
        cm1 = z1 / n
        c11 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)),
                               K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)
        cm2 = z2 / n
        c21 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)),
                               K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)
        (c11, c21) = K.switch(c11 < 0, lambda: (zero, one), lambda: (c11, c21))
        (c21, c11) = K.switch(c21 < 0, lambda: (zero, one), lambda: (c21, c11))

        grad61 = gradients6 - gradients1
        grad16 = gradients1 - gradients6
        z1 = K.sum(grad61 * gradients6)
        z2 = K.sum(grad16 * gradients1)
        n = K.sum(grad61 * grad61)
        cm1 = z1 / n
        c12 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)),
                               K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)   # for CE conv
        cm2 = z2 / n
        c22 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)),
                               K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)  # for l2 conv
        (c12, c22) = K.switch(c12 < 0, lambda: (zero, one), lambda: (c12, c22))
        (c22, c12) = K.switch(c22 < 0, lambda: (zero, one), lambda: (c22, c12))
        c1 = c11  # for CE dense
        c2 = c21  # for l1 dense

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms1 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
           for (i, p) in enumerate(params)]
    ms2 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
           for (i, p) in enumerate(params)]
    ms6 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
           for (i, p) in enumerate(params)]
    vs1 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
           for (i, p) in enumerate(params)]
    vs2 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
           for (i, p) in enumerate(params)]
    vs6 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
           for (i, p) in enumerate(params)]
    if self.amsgrad:
        vhats1 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
                  for (i, p) in enumerate(params)]
        vhats2 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
                  for (i, p) in enumerate(params)]
        vhats6 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
                  for (i, p) in enumerate(params)]
    else:
        vhats1 = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
        vhats2 = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
        vhats6 = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + ms1 + vs1 + vhats1

    if not self.split:
        # grads1, grads2
        for p, g1, g2, m1, v1, vhat1, m2, v2, vhat2 in zip(
                params, grads1, grads2, ms1, vs1, vhats1, ms2, vs2, vhats2):
            m_t1 = (self.beta_1 * m1) + (1. - self.beta_1) * g1
            m_t2 = (self.beta_1 * m2) + (1. - self.beta_1) * g2
            v_t1 = (self.beta_2 * v1) + (1. - self.beta_2) * K.square(g1)
            v_t2 = (self.beta_2 * v2) + (1. - self.beta_2) * K.square(g2)
            if self.amsgrad:
                vhat_t1 = K.maximum(vhat1, v_t1)
                vhat_t2 = K.maximum(vhat2, v_t2)
                p_t = p - lr_t * (c1 * (m_t1 / (K.sqrt(vhat_t1) + self.epsilon)) +
                                  c2 * (m_t2 / (K.sqrt(vhat_t2) + self.epsilon)))
                self.updates.append(K.update(vhat1, vhat_t1))
                self.updates.append(K.update(vhat2, vhat_t2))
            else:
                p_t = p - lr_t * (c1 * (m_t1 / (K.sqrt(v_t1) + self.epsilon)) +
                                  c2 * (m_t2 / (K.sqrt(v_t2) + self.epsilon)))
            self.updates.append(K.update(m1, m_t1))
            self.updates.append(K.update(m2, m_t2))
            self.updates.append(K.update(v1, v_t1))
            self.updates.append(K.update(v2, v_t2))
            new_p = p_t
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
    else:
        # grads 1, 5, 6
        for p, g1, g5, g6, m1, v1, vhat1, m5, v5, vhat5, m6, v6, vhat6 in zip(
                params, grads1, grads5, grads6,
                ms1, vs1, vhats1, ms2, vs2, vhats2, ms6, vs6, vhats6):
            m_t1 = (self.beta_1 * m1) + (1. - self.beta_1) * g1
            m_t5 = (self.beta_1 * m5) + (1. - self.beta_1) * g5
            m_t6 = (self.beta_1 * m6) + (1. - self.beta_1) * g6  # fixed: was m5
            v_t1 = (self.beta_2 * v1) + (1. - self.beta_2) * K.square(g1)
            v_t5 = (self.beta_2 * v5) + (1. - self.beta_2) * K.square(g5)
            v_t6 = (self.beta_2 * v6) + (1. - self.beta_2) * K.square(g6)
            if g6 == 0:  # it's a dense param
                if self.amsgrad:
                    vhat_t1 = K.maximum(vhat1, v_t1)
                    vhat_t5 = K.maximum(vhat5, v_t5)
                    p_t = p - lr_t * (c11 * (m_t1 / (K.sqrt(vhat_t1) + self.epsilon)) +
                                      c21 * (m_t5 / (K.sqrt(vhat_t5) + self.epsilon)))
                    self.updates.append(K.update(vhat1, vhat_t1))
                    self.updates.append(K.update(vhat5, vhat_t5))
                else:
                    p_t = p - lr_t * (c11 * (m_t1 / (K.sqrt(v_t1) + self.epsilon)) +
                                      c21 * (m_t5 / (K.sqrt(v_t5) + self.epsilon)))
                self.updates.append(K.update(m1, m_t1))
                self.updates.append(K.update(v1, v_t1))
                self.updates.append(K.update(m5, m_t5))
                self.updates.append(K.update(v5, v_t5))
                new_p = p_t
            else:  # it's a conv param
                if self.amsgrad:
                    vhat_t1 = K.maximum(vhat1, v_t1)
                    vhat_t6 = K.maximum(vhat6, v_t6)
                    p_t = p - lr_t * (c12 * (m_t1 / (K.sqrt(vhat_t1) + self.epsilon)) +
                                      c22 * (m_t6 / (K.sqrt(vhat_t6) + self.epsilon)))
                    self.updates.append(K.update(vhat1, vhat_t1))
                    self.updates.append(K.update(vhat6, vhat_t6))
                else:
                    p_t = p - lr_t * (c12 * (m_t1 / (K.sqrt(v_t1) + self.epsilon)) +
                                      c22 * (m_t6 / (K.sqrt(v_t6) + self.epsilon)))
                self.updates.append(K.update(m1, m_t1))
                self.updates.append(K.update(v1, v_t1))
                self.updates.append(K.update(m6, m_t6))
                self.updates.append(K.update(v6, v_t6))
                new_p = p_t
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
    return self.updates, c1, c2
def reset_states(self, states=None):
    if not self.stateful:
        raise AttributeError('Layer must be stateful.')
    input_shape = self.input_spec[0].shape
    state_shape = self.compute_output_shape(input_shape)
    if self.return_state:
        state_shape = state_shape[0]
    if self.return_sequences:
        state_shape = state_shape[:1].concatenate(state_shape[2:])
    if None in state_shape:
        raise ValueError('If a RNN is stateful, it needs to know '
                         'its batch size. Specify the batch size '
                         'of your input tensors: \n'
                         '- If using a Sequential model, '
                         'specify the batch size by passing '
                         'a `batch_input_shape` '
                         'argument to your first layer.\n'
                         '- If using the functional API, specify '
                         'the time dimension by passing a '
                         '`batch_shape` argument to your Input layer.\n'
                         'The same thing goes for the number of rows and '
                         'columns.')

    # helper function
    def get_tuple_shape(nb_channels):
        result = list(state_shape)
        if self.cell.data_format == 'channels_first':
            result[1] = nb_channels
        elif self.cell.data_format == 'channels_last':
            result[3] = nb_channels
        else:
            raise KeyError
        return tuple(result)

    # initialize state if None
    if self.states[0] is None:
        if hasattr(self.cell.state_size, '__len__'):
            self.states = [K.zeros(get_tuple_shape(dim))
                           for dim in self.cell.state_size]
        else:
            self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
    elif states is None:
        if hasattr(self.cell.state_size, '__len__'):
            for state, dim in zip(self.states, self.cell.state_size):
                K.set_value(state, np.zeros(get_tuple_shape(dim)))
        else:
            K.set_value(self.states[0],
                        np.zeros(get_tuple_shape(self.cell.state_size)))
    else:
        if not isinstance(states, (list, tuple)):
            states = [states]
        if len(states) != len(self.states):
            raise ValueError('Layer ' + self.name + ' expects ' +
                             str(len(self.states)) + ' states, ' +
                             'but it received ' + str(len(states)) +
                             ' state values. Input received: ' + str(states))
        for index, (value, state) in enumerate(zip(states, self.states)):
            if hasattr(self.cell.state_size, '__len__'):
                dim = self.cell.state_size[index]
            else:
                dim = self.cell.state_size
            if value.shape != get_tuple_shape(dim):
                raise ValueError('State ' + str(index) +
                                 ' is incompatible with layer ' + self.name +
                                 ': expected shape=' + str(get_tuple_shape(dim)) +
                                 ', found shape=' + str(value.shape))
            # TODO(anjalisridhar): consider batch calls to `set_value`.
            K.set_value(state, value)
def build(self, input_shape):
    # assert self.output_dim == input_shape[-1]
    self.input_spec = [InputSpec(shape=input_shape)]
    self.middle_length = input_shape[2]
    input_dim = input_shape[3]

    # Attention
    self.W_a = self.init((input_dim + self.output_dim, self.output_dim),
                         name='{}_W_a'.format(self.name))
    self.b_a = K.zeros((self.output_dim,), name='{}_b_a'.format(self.name))

    # Regular LSTM
    self.input_dim = input_dim
    if self.stateful:
        self.reset_states()
    else:
        # initial states: 2 all-zero tensors of shape (output_dim)
        self.states = [None, None]

    self.W_i = self.init((input_dim, self.output_dim), name='{}_W_i'.format(self.name))
    self.U_i = self.inner_init((self.output_dim, self.output_dim), name='{}_U_i'.format(self.name))
    self.b_i = K.zeros((self.output_dim,), name='{}_b_i'.format(self.name))

    self.W_f = self.init((input_dim, self.output_dim), name='{}_W_f'.format(self.name))
    self.U_f = self.inner_init((self.output_dim, self.output_dim), name='{}_U_f'.format(self.name))
    self.b_f = self.forget_bias_init((self.output_dim,), name='{}_b_f'.format(self.name))

    self.W_c = self.init((input_dim, self.output_dim), name='{}_W_c'.format(self.name))
    self.U_c = self.inner_init((self.output_dim, self.output_dim), name='{}_U_c'.format(self.name))
    self.b_c = K.zeros((self.output_dim,), name='{}_b_c'.format(self.name))

    self.W_o = self.init((input_dim, self.output_dim), name='{}_W_o'.format(self.name))
    self.U_o = self.inner_init((self.output_dim, self.output_dim), name='{}_U_o'.format(self.name))
    self.b_o = K.zeros((self.output_dim,), name='{}_b_o'.format(self.name))

    self.regularizers = []
    if self.W_regularizer:
        self.W_regularizer.set_param(
            K.concatenate([self.W_i, self.W_f, self.W_c, self.W_o]))
        self.regularizers.append(self.W_regularizer)
    if self.U_regularizer:
        self.U_regularizer.set_param(
            K.concatenate([self.U_i, self.U_f, self.U_c, self.U_o]))
        self.regularizers.append(self.U_regularizer)
    if self.b_regularizer:
        self.b_regularizer.set_param(
            K.concatenate([self.b_i, self.b_f, self.b_c, self.b_o]))
        self.regularizers.append(self.b_regularizer)

    self.trainable_weights = [self.W_i, self.U_i, self.b_i,
                              self.W_c, self.U_c, self.b_c,
                              self.W_f, self.U_f, self.b_f,
                              self.W_o, self.U_o, self.b_o,
                              self.W_a, self.b_a]

    if self.initial_weights is not None:
        self.set_weights(self.initial_weights)
        del self.initial_weights
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def fn(x, P_acc):
    x_ = K.zeros((self.nb_actions, self.nb_actions))
    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], x)
    return x_
def call(self, x, mask=None):
    # TODO: validate input shape
    # The input of this layer is [L, mu, a] in concatenated form. We first
    # split those up.
    idx = 0
    if self.mode == 'full':
        L_flat = x[:, idx:idx + (self.nb_actions * self.nb_actions + self.nb_actions) // 2]
        idx += (self.nb_actions * self.nb_actions + self.nb_actions) // 2
    elif self.mode == 'diag':
        L_flat = x[:, idx:idx + self.nb_actions]
        idx += self.nb_actions
    else:
        L_flat = None
    assert L_flat is not None
    mu = x[:, idx:idx + self.nb_actions]
    idx += self.nb_actions
    a = x[:, idx:idx + self.nb_actions]
    idx += self.nb_actions

    if self.mode == 'full':
        # Create L and L^T matrix, which we use to construct the
        # positive-definite matrix P.
        L = None
        LT = None
        if K._BACKEND == 'theano':
            import theano.tensor as T
            import theano

            def fn(x, L_acc, LT_acc):
                x_ = K.zeros((self.nb_actions, self.nb_actions))
                x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)], x)
                diag = K.exp(T.diag(x_)) + K.epsilon()
                x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], diag)
                return x_, x_.T

            outputs_info = [
                K.zeros((self.nb_actions, self.nb_actions)),
                K.zeros((self.nb_actions, self.nb_actions)),
            ]
            results, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
            L, LT = results
        elif K._BACKEND == 'tensorflow':
            import tensorflow as tf

            # Number of elements in a triangular matrix.
            nb_elems = (self.nb_actions * self.nb_actions + self.nb_actions) // 2

            # Create mask for the diagonal elements in L_flat. This is used to
            # exponentiate only the diagonal elements, which is done before gathering.
            diag_indeces = [0]
            for row in range(1, self.nb_actions):
                diag_indeces.append(diag_indeces[-1] + (row + 1))
            diag_mask = np.zeros(1 + nb_elems)  # +1 for the leading zero
            diag_mask[np.array(diag_indeces) + 1] = 1
            diag_mask = K.variable(diag_mask)

            # Add leading zero element to each element in the L_flat. We use this
            # zero element when gathering L_flat into a lower triangular matrix L.
            nb_rows = tf.shape(L_flat)[0]
            zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1)
            L_flat = tf.concat(1, [zeros, L_flat])

            # Create mask that can be used to gather elements from L_flat and put
            # them into a lower triangular matrix.
            tril_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32')
            tril_mask[np.tril_indices(self.nb_actions)] = range(1, nb_elems + 1)

            # Finally, process each element of the batch.
            init = [
                K.zeros((self.nb_actions, self.nb_actions)),
                K.zeros((self.nb_actions, self.nb_actions)),
            ]

            def fn(a, x):
                # Exponentiate everything. This is much easier than only
                # exponentiating the diagonal elements, and, usually, the action
                # space is relatively low.
                x_ = K.exp(x) + K.epsilon()
                # Only keep the diagonal elements.
                x_ *= diag_mask
                # Add the original, non-diagonal elements.
                x_ += x * (1. - diag_mask)
                # Finally, gather everything into a lower triangular matrix.
                L_ = tf.gather(x_, tril_mask)
                return [L_, tf.transpose(L_)]

            tmp = tf.scan(fn, L_flat, initializer=init)
            if isinstance(tmp, (list, tuple)):
                # TensorFlow 0.10 now returns a tuple of tensors.
                L, LT = tmp
            else:
                # Old TensorFlow < 0.10 returns a shared tensor.
                L = tmp[:, 0, :, :]
                LT = tmp[:, 1, :, :]
        else:
            raise RuntimeError('Unknown Keras backend "{}".'.format(K._BACKEND))
        assert L is not None
        assert LT is not None
        P = K.batch_dot(L, LT)
    elif self.mode == 'diag':
        if K._BACKEND == 'theano':
            import theano.tensor as T
            import theano

            def fn(x, P_acc):
                x_ = K.zeros((self.nb_actions, self.nb_actions))
                x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], x)
                return x_

            outputs_info = [
                K.zeros((self.nb_actions, self.nb_actions)),
            ]
            P, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
        elif K._BACKEND == 'tensorflow':
            import tensorflow as tf

            # Create mask that can be used to gather elements from L_flat and
            # put them into a diagonal matrix.
            diag_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32')
            diag_mask[np.diag_indices(self.nb_actions)] = range(1, self.nb_actions + 1)

            # Add leading zero element to each element in the L_flat. We use this
            # zero element when gathering L_flat into a diagonal matrix.
            nb_rows = tf.shape(L_flat)[0]
            zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1)
            L_flat = tf.concat(1, [zeros, L_flat])

            # Finally, process each element of the batch.
            def fn(a, x):
                x_ = tf.gather(x, diag_mask)
                return x_

            P = tf.scan(fn, L_flat,
                        initializer=K.zeros((self.nb_actions, self.nb_actions)))
        else:
            raise RuntimeError('Unknown Keras backend "{}".'.format(K._BACKEND))
    assert P is not None
    assert K.ndim(P) == 3

    # Combine a, mu and P into a scalar (over the batches). What we compute here
    # is -.5 * (a - mu)^T * P * (a - mu), where * denotes the dot-product.
    # Unfortunately TensorFlow handles vector * P slightly suboptimally, hence we
    # convert the vectors to 1xd/dx1 matrices and finally flatten the resulting
    # 1x1 matrix into a scalar. All operations happen over the batch size, which
    # is dimension 0.
    prod = K.batch_dot(K.expand_dims(a - mu, dim=1), P)
    prod = K.batch_dot(prod, K.expand_dims(a - mu, dim=-1))
    A = -.5 * K.batch_flatten(prod)
    assert K.ndim(A) == 2
    return A
def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators
    return accumulators
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # if a weight tensor (len > 1) use weight normalized parameterization
        # this is the only part changed w.r.t. keras.optimizers.Adam
        ps = K.get_variable_shape(p)
        if len(ps) > 1:
            # get weight normalization parameters
            V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

            # Adam containers for the 'g' parameter
            V_scaler_shape = K.get_variable_shape(V_scaler)
            m_g = K.zeros(V_scaler_shape)
            v_g = K.zeros(V_scaler_shape)

            # update g parameters
            m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
            v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
            new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
            self.updates.append(K.update(m_g, m_g_t))
            self.updates.append(K.update(v_g, v_g_t))

            # update V parameters
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
            new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # if there are constraints we apply them to V, not W
            if p in constraints:
                c = constraints[p]
                new_V_param = c(new_V_param)

            # wn param updates --> W updates
            add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
        else:
            # do optimization normally
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))
        self.updates.append(K.update_add(self.iterations, 1))

    # momentum
    shapes = [K.get_variable_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        # if a weight tensor (len > 1) use weight normalized parameterization
        ps = K.get_variable_shape(p)
        if len(ps) > 1:
            # get weight normalization parameters
            V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

            # momentum container for the 'g' parameter
            V_scaler_shape = K.get_variable_shape(V_scaler)
            m_g = K.zeros(V_scaler_shape)

            # update g parameters
            v_g = self.momentum * m_g - lr * grad_g  # velocity
            self.updates.append(K.update(m_g, v_g))
            if self.nesterov:
                new_g_param = g_param + self.momentum * v_g - lr * grad_g
            else:
                new_g_param = g_param + v_g

            # update V parameters
            v_v = self.momentum * m - lr * grad_V  # velocity
            self.updates.append(K.update(m, v_v))
            if self.nesterov:
                new_V_param = V + self.momentum * v_v - lr * grad_V
            else:
                new_V_param = V + v_v

            # if there are constraints we apply them to V, not W
            if p in constraints:
                c = constraints[p]
                new_V_param = c(new_V_param)

            # wn param updates --> W updates
            add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
        else:
            # normal SGD with momentum
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))
            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
    return self.updates
def get_initial_state(self, x):
    input_shape = self.input_spec[0].shape
    init_nb_row = input_shape[self.row_axis]
    init_nb_col = input_shape[self.column_axis]

    base_initial_state = K.zeros_like(x)  # (samples, timesteps) + image_shape
    non_channel_axis = -1 if self.data_format == 'channels_first' else -2
    for _ in range(2):
        base_initial_state = K.sum(base_initial_state, axis=non_channel_axis)
    base_initial_state = K.sum(base_initial_state, axis=1)  # (samples, nb_channels)

    initial_states = []
    states_to_pass = ['r', 'c', 'e']
    nlayers_to_pass = {u: self.nb_layers for u in states_to_pass}
    if self.extrap_start_time is not None:
        # pass prediction in states so it can be used as the actual for t+1
        # when extrapolating
        states_to_pass.append('ahat')
        nlayers_to_pass['ahat'] = 1
    for u in states_to_pass:
        for l in range(nlayers_to_pass[u]):
            ds_factor = 2 ** l
            nb_row = init_nb_row // ds_factor
            nb_col = init_nb_col // ds_factor
            if u in ['r', 'c']:
                stack_size = self.R_stack_sizes[l]
            elif u == 'e':
                stack_size = 2 * self.stack_sizes[l]
            elif u == 'ahat':
                stack_size = self.stack_sizes[l]
            output_size = stack_size * nb_row * nb_col  # flattened size

            reducer = K.zeros((input_shape[self.channel_axis], output_size))  # (nb_channels, output_size)
            initial_state = K.dot(base_initial_state, reducer)  # (samples, output_size)
            if self.data_format == 'channels_first':
                output_shp = (-1, stack_size, nb_row, nb_col)
            else:
                output_shp = (-1, nb_row, nb_col, stack_size)
            initial_state = K.reshape(initial_state, output_shp)
            initial_states += [initial_state]

    if self.multi_task_train:
        # encoder level 0
        output_size = self.lbl_pred_chns[0] * 1 * 1  # flattened size
        reducer = K.zeros((input_shape[self.channel_axis], output_size))
        initial_state = K.dot(base_initial_state, reducer)  # (samples, output_size)
        output_shp = (-1, 1, 1, self.lbl_pred_chns[0])  # hardcoded for 'channels_last' only
        initial_state = K.reshape(initial_state, output_shp)
        initial_states += [initial_state]

        # encoder level 1
        output_size = self.nb_classes * 1 * 1  # flattened size
        reducer = K.zeros((input_shape[self.channel_axis], output_size))
        initial_state = K.dot(base_initial_state, reducer)  # (samples, output_size)
        output_shp = (-1, 1, 1, self.nb_classes)  # hardcoded for 'channels_last' only
        initial_state = K.reshape(initial_state, output_shp)
        initial_states += [initial_state]

    if K._BACKEND == 'theano':
        from theano import tensor as T
        # There is a known issue in the Theano scan op when dealing with inputs
        # whose shape is 1 along a dimension. In our case, this is a problem when
        # training on grayscale images, and the line below fixes it.
        initial_states = [T.unbroadcast(init_state, 0, 1)
                          for init_state in initial_states]

    if self.extrap_start_time is not None:
        # the last state will correspond to the current timestep
        initial_states += [K.variable(0, int if K.backend() != 'tensorflow' else 'int32')]
    return initial_states
num_epochs = 10
# defining the learning rate
lr = 0.1

# building the model
# defining the placeholders to feed the input and target data
input_tensor = K.placeholder(shape=(batch_size, input_dim), dtype='float32')
target_tensor = K.placeholder(shape=(batch_size, 1), dtype='float32')

# defining the weight and the bias variables
weight_variable = K.random_uniform_variable(shape=(input_dim, 1),
                                            low=-1., high=1., dtype='float32')
bias_variable = K.zeros(shape=(1,), dtype='float32')

# defining the sigmoid output tensor
output_tensor = K.dot(input_tensor, weight_variable) + bias_variable
output_tensor = K.sigmoid(output_tensor)

# defining the mean loss tensor
loss_tensor = K.mean(K.binary_crossentropy(target_tensor, output_tensor))

# getting the gradients of the mean loss with respect to the weight and bias
gradient_tensors = K.gradients(loss=loss_tensor,
                               variables=[weight_variable, bias_variable])

# creating the updates based on the stochastic gradient descent rule
updates = [(weight_variable, weight_variable - lr * gradient_tensors[0]),
           (bias_variable, bias_variable - lr * gradient_tensors[1])]
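# A minimal continuation sketch: compile the update ops above into a callable
# with K.function and run the training loop. `x_train` and `y_train` are
# assumed NumPy arrays of shape (num_samples, input_dim) and (num_samples, 1);
# they are not defined in the excerpt above. Only full batches are fed, since
# the placeholders are declared with a fixed batch_size.
train_step = K.function(inputs=[input_tensor, target_tensor],
                        outputs=[loss_tensor],
                        updates=updates)
for epoch in range(num_epochs):
    for start in range(0, len(x_train) - batch_size + 1, batch_size):
        x_batch = x_train[start:start + batch_size]
        y_batch = y_train[start:start + batch_size]
        loss_value = train_step([x_batch, y_batch])[0]
    print('epoch %d: last batch loss %.4f' % (epoch, loss_value))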
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    completed_updates = K.cast(K.tf.floordiv(self.iterations, self.accum_iters), K.floatx())
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * completed_updates))

    t = completed_updates + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    # self.iterations incremented after processing a batch
    # batch:              1 2 3 4 5 6 7 8 9
    # self.iterations:    0 1 2 3 4 5 6 7 8
    # update_switch = 1:        x       x     (if accum_iters=4)
    update_switch = K.equal((self.iterations + 1) % self.accum_iters, 0)
    update_switch = K.cast(update_switch, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat, tg in zip(params, grads, ms, vs, vhats, gs):
        sum_grad = tg + g
        avg_grad = sum_grad / self.accum_iters_float

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * avg_grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(avg_grad)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(
                K.update(vhat, (1 - update_switch) * vhat + update_switch * vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, (1 - update_switch) * m + update_switch * m_t))
        self.updates.append(K.update(v, (1 - update_switch) * v + update_switch * v_t))
        self.updates.append(K.update(tg, (1 - update_switch) * sum_grad))

        new_p = p_t
        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(
            K.update(p, (1 - update_switch) * p + update_switch * new_p))
    return self.updates
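# Quick plain-Python check of the update_switch schedule used above, assuming
# accum_iters = 4: parameters are only written on every fourth batch, matching
# the schedule sketched in the comments of get_updates.
accum_iters = 4
switches = [int((it + 1) % accum_iters == 0) for it in range(9)]
assert switches == [0, 0, 0, 1, 0, 0, 0, 1, 0]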
def get_updates(self, loss1, loss2, loss3, loss4, loss5, loss6, params):
    grads1 = self.get_gradients(loss1, params)
    grads2 = self.get_gradients(loss2, params)
    grads5 = self.get_gradients(loss5, params)  # l1 loss, dense
    grads6 = self.get_gradients(loss6, params)  # l2 loss, conv
    accumulators1 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accumulator_' + str(i))
                     for (i, p) in enumerate(params)]
    accumulators2 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accumulator_' + str(i))
                     for (i, p) in enumerate(params)]
    accumulators6 = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accumulator_' + str(i))
                     for (i, p) in enumerate(params)]
    self.weights = [self.iterations] + accumulators1
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    c1 = self.descent_weight1
    c2 = self.descent_weight2
    # for split without multi, specify the split weighting explicitly
    c11 = c1    # for CE dense
    c21 = c2    # for l1 dense
    c12 = 1     # for CE conv
    c22 = 4e-1  # for l2 conv

    if self.multi and not self.split:
        # calculate the weighting for the given loss functions (should be the default)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        gradients1 = K.concatenate([K.flatten(x) for x in grads1])
        gradients2 = K.concatenate([K.flatten(x) for x in grads2])
        grad21 = gradients2 - gradients1
        grad12 = gradients1 - gradients2
        z1 = K.sum(grad21 * gradients2)
        z2 = K.sum(grad12 * gradients1)
        n = K.sum(grad21 * grad21)
        cm1 = z1 / n
        c1 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)), K.constant(True, dtype=bool)),
                      lambda: one, lambda: cm1)
        cm2 = z2 / n
        c2 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)), K.constant(True, dtype=bool)),
                      lambda: zero, lambda: cm2)
        (c1, c2) = K.switch(c1 < 0, lambda: (zero, one), lambda: (c1, c2))
        (c2, c1) = K.switch(c2 < 0, lambda: (zero, one), lambda: (c2, c1))

    if self.split and self.multi:
        # calculate the weighting for loss1, but split into conv/dense parts and use a
        # different loss2 for each part (i.e. split loss2 into loss5 and loss6)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        gradients1 = K.concatenate([K.flatten(x) for x in grads1])
        gradients5 = K.concatenate([K.flatten(x) for x in grads5])
        gradients6 = K.concatenate([K.flatten(x) for x in grads6])
        grad51 = gradients5 - gradients1
        grad15 = gradients1 - gradients5
        z1 = K.sum(grad51 * gradients5)
        z2 = K.sum(grad15 * gradients1)
        n = K.sum(grad51 * grad51)
        cm1 = z1 / n
        c11 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)), K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)
        cm2 = z2 / n
        c21 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)), K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)
        (c11, c21) = K.switch(c11 < 0, lambda: (zero, one), lambda: (c11, c21))
        (c21, c11) = K.switch(c21 < 0, lambda: (zero, one), lambda: (c21, c11))
        grad61 = gradients6 - gradients1
        grad16 = gradients1 - gradients6
        z1 = K.sum(grad61 * gradients6)
        z2 = K.sum(grad16 * gradients1)
        n = K.sum(grad61 * grad61)
        cm1 = z1 / n
        c12 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)), K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)   # for CE conv
        cm2 = z2 / n
        c22 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)), K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)  # for l2 conv
        (c12, c22) = K.switch(c12 < 0, lambda: (zero, one), lambda: (c12, c22))
        (c22, c12) = K.switch(c22 < 0, lambda: (zero, one), lambda: (c22, c12))
        c1 = c11  # for CE dense
        c2 = c21  # for l1 dense

    if not self.split:
        # uses grads1 and grads2
        for p, g1, g2, a1, a2 in zip(params, grads1, grads2, accumulators1, accumulators2):
            # update the accumulators
            new_a1 = self.rho * a1 + (1. - self.rho) * K.square(g1)
            new_a2 = self.rho * a2 + (1. - self.rho) * K.square(g2)
            self.updates.append(K.update(a1, new_a1))
            self.updates.append(K.update(a2, new_a2))
            new_p = p - lr * (c1 * (g1 / (K.sqrt(new_a1) + self.epsilon)) +
                              c2 * (g2 / (K.sqrt(new_a2) + self.epsilon)))
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
    else:
        # uses grads1, grads5 and grads6
        for p, g1, g5, g6, a1, a5, a6 in zip(params, grads1, grads5, grads6,
                                             accumulators1, accumulators2, accumulators6):
            # note: this Python-level check only works if get_gradients returns a
            # literal 0 for parameters that loss6 does not touch
            if g6 == 0:  # it's a dense param
                # update the accumulators
                new_a1 = self.rho * a1 + (1. - self.rho) * K.square(g1)
                new_a5 = self.rho * a5 + (1. - self.rho) * K.square(g5)
                self.updates.append(K.update(a1, new_a1))
                self.updates.append(K.update(a5, new_a5))
                new_p = p - lr * (c11 * (g1 / (K.sqrt(new_a1) + self.epsilon)) +
                                  c21 * (g5 / (K.sqrt(new_a5) + self.epsilon)))
            else:  # it's a conv param
                # update the accumulators
                new_a1 = self.rho * a1 + (1. - self.rho) * K.square(g1)
                new_a6 = self.rho * a6 + (1. - self.rho) * K.square(g6)
                self.updates.append(K.update(a1, new_a1))
                self.updates.append(K.update(a6, new_a6))
                new_p = p - lr * (c12 * (g1 / (K.sqrt(new_a1) + self.epsilon)) +
                                  c22 * (g6 / (K.sqrt(new_a6) + self.epsilon)))
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
    return self.updates, c1, c2
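# A small numpy sketch (independent of the optimizer above) of what the c1/c2
# weighting appears to compute: the coefficients of the minimum-norm convex
# combination of the two flattened gradients, i.e. the closed-form two-task
# min-norm solution, with negative coefficients clipped to the segment ends.
# Note that c1 + c2 == 1 by construction.
import numpy as np

g1 = np.random.randn(10)
g2 = np.random.randn(10)
d = g2 - g1
c1 = np.dot(d, g2) / np.dot(d, d)    # z1 / n above
c2 = np.dot(-d, g1) / np.dot(d, d)   # z2 / n above
c1, c2 = ((0., 1.) if c1 < 0 else (c1, c2))
c2, c1 = ((0., 1.) if c2 < 0 else (c2, c1))
assert abs(c1 + c2 - 1.) < 1e-12
combo = c1 * g1 + c2 * g2            # shared descent direction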
    X = np.asarray(X, dtype=np.float32)
    Y = np.asarray(Y, dtype=np.float32)
    return X, Y

N = 100
X_train, Y_train = data_construction(N, length=6, size=5, end_marker=True)
np.set_printoptions(precision=3)

model = 'D'  ## A
if model == 'A':
    controller_input = Input(shape=(14, 12), name='New_Input')
    MEMORY = Lambda(lambda x: K.zeros(shape=(1, 120, 40)), name='Memory_0')(controller_input)
    usage_weights = Lambda(lambda x: K.zeros(shape=(1, 1, 120)), name='Usage_Weights_0')(controller_input)
    read_weights = Lambda(lambda x: K.zeros(shape=(1, 14, 120)), name='Read_Weights_0')(controller_input)
    controller = LSTM(units=200, activation='tanh', stateful=False,
                      return_sequences=True, name='LSTM_CONTROLLER')(controller_input)
    write_keys = Dense(40, activation='tanh', name='Write_Keys')(controller)
    read_keys = Dense(40, activation='tanh', name='Read_Keys')(controller)
    omegas = Dense(1, activation='sigmoid', name='Omegas')(controller)
    least_usage = Lambda(lambda x: K.one_hot(indices=K.argmax(-x), num_classes=120),
                         name='Least_Usage')(usage_weights)
    omegas_tiled = Lambda(lambda x: K.tile(x, (1, 1, 120)))(omegas)
    compl_omegas = Lambda(lambda o: K.ones(shape=(14, 120)) - o)(omegas_tiled)
    rd_part = Multiply()([omegas_tiled, read_weights])
    us_part = Multiply()([compl_omegas, least_usage])
    write_weights = Add(name='Write_Weights')([rd_part, us_part])
    writing = Dot(axes=[1, 1])([write_weights, write_keys])
    MEMORY = Add(name='Memory')([MEMORY, writing])
def get_updates(self, loss1, loss2, loss3, loss4, loss5, loss6, params):
    grads1 = self.get_gradients(loss1, params)
    grads2 = self.get_gradients(loss2, params)
    grads5 = self.get_gradients(loss5, params)  # l1 loss, dense
    grads6 = self.get_gradients(loss6, params)  # l2 loss, conv
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    c1 = self.descent_weight1
    c2 = self.descent_weight2
    # for split without multi, specify the split weighting explicitly
    c11 = c1    # for CE dense
    c21 = c2    # for l1 dense
    c12 = 1     # for CE conv
    c22 = 4e-1  # for l2 conv

    if self.multi and not self.split:
        # calculate the weighting for the given loss functions (the default, also used in the paper)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        gradients1 = K.concatenate([K.flatten(x) for x in grads1])
        gradients2 = K.concatenate([K.flatten(x) for x in grads2])
        grad21 = gradients2 - gradients1
        grad12 = gradients1 - gradients2
        z1 = K.sum(grad21 * gradients2)
        z2 = K.sum(grad12 * gradients1)
        n = K.sum(grad21 * grad21)
        cm1 = z1 / n
        c1 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)), K.constant(True, dtype=bool)),
                      lambda: one, lambda: cm1)
        cm2 = z2 / n
        c2 = K.switch(K.equal(K.all(K.equal(gradients1, gradients2)), K.constant(True, dtype=bool)),
                      lambda: zero, lambda: cm2)
        (c1, c2) = K.switch(c1 < 0, lambda: (zero, one), lambda: (c1, c2))
        (c2, c1) = K.switch(c2 < 0, lambda: (zero, one), lambda: (c2, c1))

    if self.split and self.multi:
        # calculate the weighting for loss1, but split into conv/dense parts and use a
        # different loss2 for each part (i.e. split loss2 into loss5 and loss6)
        zero = K.variable(0, name='zero')
        one = K.variable(1, name='one')
        gradients1 = K.concatenate([K.flatten(x) for x in grads1])
        gradients5 = K.concatenate([K.flatten(x) for x in grads5])
        gradients6 = K.concatenate([K.flatten(x) for x in grads6])
        grad51 = gradients5 - gradients1
        grad15 = gradients1 - gradients5
        z1 = K.sum(grad51 * gradients5)
        z2 = K.sum(grad15 * gradients1)
        n = K.sum(grad51 * grad51)
        cm1 = z1 / n
        c11 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)), K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)
        cm2 = z2 / n
        c21 = K.switch(K.equal(K.all(K.equal(gradients1, gradients5)), K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)
        (c11, c21) = K.switch(c11 < 0, lambda: (zero, one), lambda: (c11, c21))
        (c21, c11) = K.switch(c21 < 0, lambda: (zero, one), lambda: (c21, c11))
        grad61 = gradients6 - gradients1
        grad16 = gradients1 - gradients6
        z1 = K.sum(grad61 * gradients6)
        z2 = K.sum(grad16 * gradients1)
        n = K.sum(grad61 * grad61)
        cm1 = z1 / n
        c12 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)), K.constant(True, dtype=bool)),
                       lambda: one, lambda: cm1)   # for CE conv
        cm2 = z2 / n
        c22 = K.switch(K.equal(K.all(K.equal(gradients1, gradients6)), K.constant(True, dtype=bool)),
                       lambda: zero, lambda: cm2)  # for l2 conv
        (c12, c22) = K.switch(c12 < 0, lambda: (zero, one), lambda: (c12, c22))
        (c22, c12) = K.switch(c22 < 0, lambda: (zero, one), lambda: (c22, c12))
        c1 = c11  # for CE dense
        c2 = c21  # for l1 dense

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape, name='moment_' + str(i)) for (i, shape) in enumerate(shapes)]
    self.weights = [self.iterations] + moments

    if not self.split:
        for p, g1, g2, m in zip(params, grads1, grads2, moments):
            v = self.momentum * m - lr * (c1 * g1 + c2 * g2)  # velocity
            self.updates.append(K.update(m, v))
            if self.nesterov:
                new_p = p + self.momentum * v - lr * (c1 * g1 + c2 * g2)
            else:
                new_p = p + v
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
    else:
        for p, g1, g5, g6, m in zip(params, grads1, grads5, grads6, moments):
            # note: this Python-level check only works if get_gradients returns a
            # literal 0 for parameters that loss6 does not touch
            if g6 == 0:  # it's a dense param
                v = self.momentum * m - lr * (c11 * g1 + c21 * g5)  # velocity
                self.updates.append(K.update(m, v))
                if self.nesterov:
                    new_p = p + self.momentum * v - lr * (c11 * g1 + c21 * g5)
                else:
                    new_p = p + v
            else:  # it's a conv param
                v = self.momentum * m - lr * (c12 * g1 + c22 * g6)  # velocity
                self.updates.append(K.update(m, v))
                if self.nesterov:
                    new_p = p + self.momentum * v - lr * (c12 * g1 + c22 * g6)
                else:
                    new_p = p + v
            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))

    self.c1 = c1
    self.c2 = c2
    return self.updates, c1, c2
def _create_all_weights(self, params):
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = accumulators
    return accumulators
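# A hedged sketch of how a _create_all_weights helper like the one above is
# typically consumed: get_updates creates the slot variables once, then pairs
# each parameter with its accumulator. The Adagrad-style rule and the
# self.lr / self.epsilon attributes below are illustrative assumptions, not
# necessarily this optimizer's actual update.
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = []
    for p, g, a in zip(params, grads, accumulators):
        new_a = a + K.square(g)  # running sum of squared gradients
        self.updates.append(K.update(a, new_a))
        new_p = p - self.lr * g / (K.sqrt(new_a) + self.epsilon)
        self.updates.append(K.update(p, new_p))
    return self.updates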
    return (np.expand_dims(style_mask, axis=0),
            np.expand_dims(target_mask, axis=0))

# Create tensor variables for images
if K.image_data_format() == 'channels_first':
    shape = (1, num_colors, img_nrows, img_ncols)
else:
    shape = (1, img_nrows, img_ncols, num_colors)

style_image = K.variable(preprocess_image(style_img_path))
target_image = K.placeholder(shape=shape)
if use_content_img:
    content_image = K.variable(preprocess_image(content_img_path))
else:
    content_image = K.zeros(shape=shape)

images = K.concatenate([style_image, target_image, content_image], axis=0)

# Create tensor variables for masks
raw_style_mask, raw_target_mask = load_mask_labels()
style_mask = K.variable(raw_style_mask.astype('float32'))
target_mask = K.variable(raw_target_mask.astype('float32'))
masks = K.concatenate([style_mask, target_mask], axis=0)

# index constants for images and tasks variables
STYLE, TARGET, CONTENT = 0, 1, 2

# Build image model, mask model and use layer outputs as features
# image model as VGG19
image_model = vgg19.VGG19(include_top=False, input_tensor=images)
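# A hedged sketch of how the STYLE/TARGET/CONTENT indices above are typically
# used downstream (following the Keras neural-doodle example this fragment
# resembles): layer outputs are computed once on the concatenated 3-image
# batch, then sliced per role. The layer name 'block5_conv2' is an assumption.
outputs_dict = {layer.name: layer.output for layer in image_model.layers}
feat = outputs_dict['block5_conv2']   # shape (3, rows, cols, channels)
style_feat = feat[STYLE, :, :, :]
target_feat = feat[TARGET, :, :, :]
content_feat = feat[CONTENT, :, :, :]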
def add_zero(x):
    xc = K.zeros((batch_size * h * w, 1))
    x = K.concatenate([x, xc], axis=1)
    return x
def __init__(self, model, momentum=0.9999):
    self.momentum = momentum
    self.model = model
    # shadow copies of the model weights, initialised to zero
    # (K.int_shape rather than K.shape: K.zeros needs a static integer shape
    # to create a variable rather than a dynamic tensor)
    self.ema_weights = [K.zeros(K.int_shape(w)) for w in model.weights]
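# Hedged sketches of how the rest of such an EMA helper commonly looks; the
# method names below are illustrative, not from the original class. After each
# training batch the shadow weights move toward the live weights.
def build_updates(self):
    # create the EMA update ops; attach these to the training function
    self.updates = [K.moving_average_update(e, w, self.momentum)
                    for e, w in zip(self.ema_weights, self.model.weights)]

def apply_ema_weights(self):
    # copy the averaged weights into the model (e.g. before evaluation)
    K.batch_set_value([(w, K.get_value(e))
                       for w, e in zip(self.model.weights, self.ema_weights)])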
# please provide the test_function which takes in the input and target, and
# outputs a tuple of (accuracy, prediction)
input_tensor = K.placeholder(shape=(batch_size, input_dim), dtype='float32')
hidden_tensor = input_tensor
target_tensor = K.placeholder(shape=(batch_size, 10), dtype='float32')

weight_variable_list = []
bias_variable_list = []
for i in range(num_layers):
    weight_variable = K.random_uniform_variable(shape=(input_dim, num_units[i]),
                                                low=-1., high=1., dtype='float32')
    bias_variable = K.zeros(shape=(num_units[i],), dtype='float32')
    weight_variable_list.append(weight_variable)
    bias_variable_list.append(bias_variable)
    hidden_layer_tensor = K.dot(hidden_tensor, weight_variable) + bias_variable
    hidden_layer_tensor = K.relu(hidden_layer_tensor)
    hidden_tensor = hidden_layer_tensor
    input_dim = num_units[i]

# output layer parameters
weight_variable = K.random_uniform_variable(shape=(input_dim, 10), low=-1., high=1., dtype='float32')
bias_variable = K.zeros(shape=(10,), dtype='float32')
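# A hedged completion of the requested test_function; the softmax output and
# the metric definitions below are assumptions, since the original stops at
# the output-layer parameters.
output_tensor = K.softmax(K.dot(hidden_tensor, weight_variable) + bias_variable)
prediction_tensor = K.argmax(output_tensor, axis=-1)
accuracy_tensor = K.mean(K.cast(K.equal(prediction_tensor,
                                        K.argmax(target_tensor, axis=-1)),
                                'float32'))
test_function = K.function(inputs=[input_tensor, target_tensor],
                           outputs=[accuracy_tensor, prediction_tensor])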
    # returns reconstructions of the dataset X as computed by the model
    num_batches = (X.shape[0] - 1) // batch_size + 1
    predictions = np.zeros((num_batches * batch_size, v_dim))
    for batch_num in range(num_batches):
        predictions[batch_slice(batch_num)] = predict_func(get_batch(X, batch_num))
    return predictions

# load and preprocess data
(X_train, y_train), (X_valid, y_valid) = mnist.load_data()
X_train, X_valid = preprocess(X_train), preprocess(X_valid)
v_dim = X_train.shape[-1]

# build the parameters of the RBM
W = glorot_normal(shape=(v_dim, h_dim), name='W')
a = K.zeros(shape=(v_dim,), name='a')
b = K.zeros(shape=(h_dim,), name='b')
params = [W, a, b]

# now build the model
# first build the visible input and map it to the hidden state probabilities
v = K.placeholder(ndim=2)
p_h = K.sigmoid(K.dot(v, W) + b)

# now Monte Carlo sample a few hs from p_h, map back to p(v|h), then average
# (K.transpose keeps this backend-agnostic; a bare W.T only works on Theano)
p_v = 0
for i in range(n_monte_carlo):
    h = sample_bernoulli(p_h)
    p_v += K.sigmoid(K.dot(h, K.transpose(W)) + a)
p_v = clip(p_v / n_monte_carlo)
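# Hedged sketches of the helpers the snippet assumes but does not define
# (sample_bernoulli and clip); these are plausible backend-level versions,
# not necessarily the original implementations.
def sample_bernoulli(p):
    # draw 0/1 samples with probability p, elementwise
    return K.cast(K.less(K.random_uniform(K.shape(p)), p), K.floatx())

def clip(p, eps=1e-7):
    # keep probabilities strictly inside (0, 1) for numerical stability
    return K.clip(p, eps, 1. - eps)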
def build(self, input, neighbour=None):
    shape = neighbour.shape
    return K.zeros(shape)
def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    moments = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    return moments
def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    accumulators = [backend.zeros(shape) for shape in shapes]
    delta_accumulators = [backend.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators
def mask(x):
    # additive causal mask: large negative values strictly above the diagonal
    shape = K.shape(x)
    mask = K.zeros((shape[1], shape[2])) + (-1e15)
    mask = tf.matrix_band_part(mask, 0, -1)  # upper triangle of `mask`
    mask -= tf.matrix_band_part(mask, 0, 0)  # remove the diagonal
    return x + mask
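# A small numpy check of what mask() builds for a length-4 sequence: strictly
# upper-triangular entries get -1e15, so a softmax over each row of masked
# attention scores effectively ignores future positions.
import numpy as np

m = np.triu(np.full((4, 4), -1e15), k=1)  # same as the band-part construction
scores = np.zeros((4, 4))                 # illustrative attention logits
masked = scores + m
probs = np.exp(masked) / np.exp(masked).sum(-1, keepdims=True)
print(np.round(probs, 3))                 # row i attends uniformly to 0..i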
def _allocate_var(self, name=None):
    return {w: K.zeros(w.get_shape(), name=name) for w in self.weights}
def hack_loss(y_true, y_pred):
    return K.zeros((1,))
def no_loss(self, y_true, y_pred):
    return K.zeros(shape=(1,))
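# A hedged usage sketch for constant-zero losses like the two above: they are
# typically attached to outputs that should not drive training (for example,
# tensors exposed only for inspection, or outputs whose real loss is added
# elsewhere in the model). The model below is illustrative, not from the
# original code; dummy targets still have to be supplied for the zero-loss
# output when calling fit().
from keras.layers import Input, Dense
from keras.models import Model

inp = Input(shape=(8,))
main = Dense(3, activation='softmax', name='main_output')(inp)
aux = Dense(4, name='aux_output')(inp)  # exposed but not trained on
model = Model(inp, [main, aux])
model.compile(optimizer='adam',
              loss={'main_output': 'categorical_crossentropy',
                    'aux_output': hack_loss})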