def call(self, inputs, reverse=False, ddi=False, **kwargs):
    logscale_factor = 3.
    x = inputs
    # Reduce over every axis except the last (channel) axis.
    reduce_axis = list(range(K.ndim(inputs)))[:-1]
    if not reverse:
        log_scale = self.log_scale
        bias = self.bias
        if ddi:
            # Data-dependent init: pick scale/bias from the first batch,
            # but only while the variables are still at their zero init.
            x_var = tf.reduce_mean(x ** 2, reduce_axis, keepdims=True)
            init_scale = tf.log(1. / (tf.sqrt(x_var) + 1e-6)) / logscale_factor
            init_bias = tf.reduce_mean(x, reduce_axis, keepdims=True)
            log_scale = K.switch(K.all(K.equal(self.log_scale, 0.)),
                                 init_scale, self.log_scale)
            bias = K.switch(K.all(K.equal(self.bias, 0.)),
                            -init_bias, self.bias)
            # Persist the data-dependent values into the variables; once
            # they are no longer all-zero, the switch adds zero instead.
            self.add_update(
                K.update_add(self.log_scale,
                             K.switch(K.all(K.equal(self.log_scale, 0.)),
                                      init_scale, K.zeros_like(init_scale))),
                inputs=x)
            self.add_update(
                K.update_add(self.bias,
                             K.switch(K.all(K.equal(self.bias, 0.)),
                                      -init_bias, K.zeros_like(init_bias))),
                inputs=x)
        return (x + bias) * K.exp(log_scale)
    else:
        # Inverse of the forward transform above.
        return x / K.exp(self.log_scale) - self.bias
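# A standalone NumPy sketch (illustrative, not part of the layer above) of
# the data-dependent init: bias = -mean(x) and
# log_scale = log(1 / (rms(x) + 1e-6)) / logscale_factor, so the first batch
# comes out zero-mean and, when logscale_factor == 1, is scaled by the
# inverse root-mean-square of the raw activations (for logscale_factor == 3,
# as above, the scale is the cube root of that factor).
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=3.0, scale=2.0, size=(64, 8))
logscale_factor = 1.0  # the layer above uses 3.; 1. makes the effect exact

x_var = (x ** 2).mean(axis=0, keepdims=True)  # uncentered second moment, as above
init_scale = np.log(1.0 / (np.sqrt(x_var) + 1e-6)) / logscale_factor
init_bias = x.mean(axis=0, keepdims=True)

y = (x - init_bias) * np.exp(init_scale)
print(y.mean())  # ~0: the bias removes the batch mean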
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Bias-corrected learning rate, as in the Adam paper.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers: the first multiplier key that appears
        # as a substring of the parameter name wins.
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(
                    multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
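# A standalone sketch (hypothetical dict and names, illustrative only) of
# the substring lookup above: the first multiplier key found inside the
# parameter's name scales its learning rate; unmatched parameters keep the
# base rate.
multipliers = {'dense_1': 0.1, 'conv2d': 0.5}

def multiplied_lr(param_name, base_lr=1e-3):
    matches = [key for key in multipliers if key in param_name]
    return base_lr * multipliers[matches[0]] if matches else base_lr

print(multiplied_lr('dense_1/kernel:0'))  # -> base_lr * 0.1
print(multiplied_lr('dense_9/bias:0'))    # -> base_lr (no key matches)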
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    shapes = [K.int_shape(p) for p in params]
    prev_grads = [K.zeros(shape, name='prev_grad_' + str(i))
                  for (i, shape) in enumerate(shapes)]
    ds = [K.zeros(shape, name='d_' + str(i))
          for (i, shape) in enumerate(shapes)]
    vs = [K.zeros(shape, name='v_' + str(i))
          for (i, shape) in enumerate(shapes)]
    self.weights = [self.iterations] + ds + vs + prev_grads

    for p, g, pg, v, d in zip(params, grads, prev_grads, vs, ds):
        # Momentum step.
        v_t = self.momentum * v - self.lr * g
        self.updates.append(K.update(v, v_t))
        # Smoothed gradient difference (derivative term, weighted by kd).
        d_t = self.momentum * d + (1 - self.momentum) * (g - pg)
        self.updates.append(K.update(d, d_t))
        self.updates.append(K.update(pg, g))

        new_p = p + v_t + self.kd * d_t
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    self.updates = [
        K.update_add(self.iterations, 1),
        # Advance the wrapped optimizer's counter only on "apply" steps
        # (K.cast here; K.constant cannot take a tensor condition).
        K.update_add(self.optimizer.iterations, K.cast(self.cond, 'int64'))
    ]

    # Accumulate gradients across micro-batches; reset on apply steps.
    self.accum_grads = [K.zeros(K.int_shape(p), dtype=K.dtype(p))
                        for p in params]
    grads = self.get_gradients(loss, params)
    for g, ag in zip(grads, self.accum_grads):
        self.updates.append(
            K.update(ag, K.switch(self.cond, ag * 0, ag + g)))

    # Reuse the wrapped optimizer's updates, skipping its own iteration
    # update (first element); loss/params are forwarded explicitly.
    self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
    self.weights.extend(self.optimizer.weights)
    return self.updates
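# A NumPy sketch of the accumulation pattern above (illustrative only):
# gradients are summed across micro-batches and the accumulator is zeroed on
# the update where the apply condition fires. Under one plausible reading of
# the graph ordering, the inner optimizer sees the accumulator's old value
# on that step, and the gradient of the apply step itself is dropped by the
# reset, exactly as the K.switch above is written.
import numpy as np

steps = 4
accum = np.zeros(3)
for it in range(1, 9):
    g = np.full(3, float(it))   # stand-in for a micro-batch gradient
    cond = (it % steps == 0)    # plays the role of self.cond
    if cond:
        print('apply at iteration', it, 'with accumulated grad', accum)
    accum = np.zeros_like(accum) if cond else accum + g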
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Bias-corrected step size, then bounds on the actual learning rate.
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Apply weight decay.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Clip the per-element step size into [lower_bound, upper_bound].
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(
            K.maximum(step_size_p_bound, lower_bound), upper_bound)

        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
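# A NumPy sketch of the AdaBound clamp above (illustrative only): the
# per-element step size step_size / denom is clipped into
# [lower_bound, upper_bound], and both bounds tighten toward final_lr as t
# grows, interpolating from Adam-like toward SGD-like steps.
import numpy as np

step_size, final_lr, gamma = 1e-3, 1e-2, 1e-3
denom = np.array([1e-4, 1e-2, 1.0])  # stand-in for sqrt(v_t) + eps

for t in [1.0, 1e3, 1e6]:
    lower = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper = final_lr * (1.0 + 1.0 / (gamma * t))
    bounded = np.clip(step_size / denom, lower, upper)
    print(t, bounded)  # bounds converge to final_lr for large t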
def get_updates(self, loss, params):
    assert params == self.predictions_keras_model.weights
    wave_function_jacobian_minus_mean = None
    # The full (centered) Jacobian is needed only when it is used
    # explicitly; with the iterative solver, Jacobian-vector products
    # suffice.
    if not (self.iterative_solver
            and self.compute_jvp_instead_of_full_jacobian):
        wave_function_jacobian_minus_mean = \
            self.get_wave_function_jacobian_minus_mean()
    energy_grad = self.get_energy_grad(loss, wave_function_jacobian_minus_mean)
    flat_gradient = \
        self.compute_wave_function_gradient_covariance_inverse_multiplication(
            energy_grad, wave_function_jacobian_minus_mean)
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates += self.apply_complex_gradient(flat_gradient * (-1.0 + 0j))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Bias corrections according to the Adam paper.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # Add a lr multiplier for vars outside excluded_vars.
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Schedule multiplier eta_t = 1 for simple AdamW. According to the
        # AdamW paper, eta_t can be fixed, decay, or be used for warm
        # restarts (AdamWR to come).
        eta_t = 1.
        p_t = p - eta_t * (multiplied_lr_t * m_t /
                           (K.sqrt(v_t) + self.epsilon))

        if self.weight_decay != 0:
            # Normalized weight decay according to the AdamW paper.
            w_d = self.weight_decay * K.sqrt(
                self.batch_size / (self.samples_per_epoch * self.epochs))
            p_t = p_t - eta_t * (w_d * p)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates_Padam(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    base_lr = self._optimizer.learning_rate
    if self.initial_decay > 0:
        base_lr = base_lr * (1. / (1. + self.decay * K.cast(
            self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = base_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        if self._get_multiplier(p) is None:
            multiplier = 1.0
        else:
            multiplier = self._get_multiplier(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Partial momentum adaption: denom is raised to 2 * partial, so
        # partial = 0.5 recovers Adam and partial -> 0 approaches SGD.
        new_p = p - (lr_t * multiplier * (m_t / (denom ** (self.partial * 2))))

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    shapes = [K.int_shape(p) for p in params]
    prev_grads = [K.zeros(shape, name='prev_grad_' + str(i))
                  for (i, shape) in enumerate(shapes)]
    self.weights = [self.iterations] + prev_grads

    for p, g, pg in zip(params, grads, prev_grads):
        # Gradient step plus a derivative term on the gradient difference.
        new_p = p - self.lr * g + self.kd * (g - pg)
        self.updates.append(K.update(pg, g))
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    beta_1_t = K.pow(self.beta_1, t)
    beta_2_t = K.pow(self.beta_2, t)
    # Length of the approximated SMA and its rectification term (RAdam).
    rho = 2 / (1 - self.beta_2) - 1
    rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t)
    r_t = K.sqrt(
        K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho /
        ((rho - 4) * (rho - 2) * rho_t))
    # The adaptive variance is considered tractable only once rho_t > 4.
    flag = K.cast(rho_t > 4, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        mhat_t = m_t / (1 - beta_1_t)
        vhat_t = K.sqrt(v_t / (1 - beta_2_t))
        # Rectified adaptive step when flag == 1, plain momentum otherwise.
        p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) +
                                 (1 - flag))

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
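# A NumPy sketch of the RAdam rectification used above (illustrative only):
# the rectifier r_t kicks in once the SMA length rho_t exceeds 4; the
# K.relu/flag pair above realizes the same piecewise rule inside the graph.
import numpy as np

beta_2 = 0.999
rho_inf = 2.0 / (1.0 - beta_2) - 1.0
for t in [1.0, 10.0, 100.0, 1000.0]:
    beta_2_t = beta_2 ** t
    rho_t = rho_inf - 2.0 * t * beta_2_t / (1.0 - beta_2_t)
    if rho_t > 4.0:
        r_t = np.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf
                      / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        print(t, 'rectified, r_t =', round(r_t, 4))
    else:
        print(t, 'un-rectified (plain momentum step)')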
def __call__(self, *args, **kwargs):
    gs = tf.train.get_global_step()
    if gs is None:
        # Global step not set: create our own counter variable.
        self.global_step = K.variable(tf.zeros(shape=(), dtype=tf.int64),
                                      dtype=tf.int64,
                                      name="lr_global_step")
        tf.train.global_step(K.get_session(), self.global_step)
        gs = K.update_add(self.global_step, 1)
    else:
        self.global_step = gs
    assert gs is not None

    gstep = tf.cast(gs, dtype=tf.float32)
    # Exponential ramp up to max_lr, then exponential decay from it.
    lr_up = K.exp(self.step_accelerate_log * gstep) * self.min_lr
    lr_down = K.exp(self.step_deccelerate_log *
                    (gstep - self.step_max_lr)) * self.max_lr
    lr = K.switch(K.less(gs, self.step_max_lr), lr_up, lr_down)

    if self.tensorboardimage and not self.added_scalar_to_tensorboard:
        self.tensorboardimage.add_scalar("learning_rate", lr)
        self.added_scalar_to_tensorboard = True  # add once
    return lr
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    # `iterations` is advanced via the control dependency below, so it is
    # not also added to the update list here (that would double-step it).
    self.updates = []

    wd = self.wd * self.wd_normalizer  # decoupled weight decay (4/6)

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * math_ops.cast(
            self.iterations, K.dtype(self.decay))))
    eta_t = lr / self.init_lr  # decoupled weight decay (5/6)

    with ops.control_dependencies(
            [state_ops.assign_add(self.iterations, 1)]):
        t = math_ops.cast(self.iterations, K.floatx())
    # Bias corrections according to the Adam paper.
    lr_t = lr * (K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
                 (1. - math_ops.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
        p_t -= eta_t * wd * p  # decoupled weight decay (6/6)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    wd = self.wd  # decoupled weight decay (4/6)

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    eta_t = lr / self.init_lr  # decoupled weight decay (5/6)

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        # Adam step plus decoupled weight decay (6/6).
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - eta_t * wd * p

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
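# A scalar NumPy sketch (illustrative only) of the decoupled decay above:
# AdamW subtracts eta_t * wd * p directly from the parameter, so the decay
# never passes through the adaptive moment estimates, unlike
# L2-regularization-style decay folded into the gradient.
import numpy as np

lr_t, eta_t, wd, eps = 1e-3, 1.0, 1e-2, 1e-8
p, m_t, v_t = 0.5, 0.02, 4e-4  # stand-in parameter and moments

adam_step = lr_t * m_t / (np.sqrt(v_t) + eps)
p_decoupled = p - adam_step - eta_t * wd * p  # as in the loop above
print(p_decoupled)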
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = K.cast(self.iterations, K.floatx()) + 1
    # Linear warmup to self.lr, then linear decay toward self.min_lr,
    # clamped once t reaches decay_steps.
    lr = K.switch(
        t <= self.warmup_steps,
        self.lr * (t / self.warmup_steps),
        self.min_lr + (self.lr - self.min_lr) *
        (1.0 - K.minimum(t, self.decay_steps) / self.decay_steps),
    )

    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_{}'.format(i))
          for i, p in enumerate(params)]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_{}'.format(i))
          for i, p in enumerate(params)]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p),
                         name='vh_{}'.format(i))
                 for i, p in enumerate(params)]
    else:
        vhats = [K.zeros(1, dtype=K.dtype(p), name='vh_{}'.format(i))
                 for i, p in enumerate(params)]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = m_t / (K.sqrt(v_t) + self.epsilon)

        # Decoupled weight decay, optionally restricted by name patterns.
        if self.initial_weight_decay > 0.0:
            if self.weight_decay_pattern is None:
                p_t += self.weight_decay * p
            else:
                for pattern in self.weight_decay_pattern:
                    if pattern in p.name:
                        p_t += self.weight_decay * p
                        break
        p_t = p - lr_t * p_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
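# A plain-Python sketch of the schedule above (illustrative only): linear
# warmup to lr over warmup_steps, then linear decay toward min_lr, clamped
# at min_lr once t reaches decay_steps.
lr, min_lr = 1e-3, 1e-5
warmup_steps, decay_steps = 100.0, 1000.0

def schedule(t):
    if t <= warmup_steps:
        return lr * t / warmup_steps
    return min_lr + (lr - min_lr) * (1.0 - min(t, decay_steps) / decay_steps)

for t in [1.0, 100.0, 550.0, 1000.0, 5000.0]:
    print(t, schedule(t))  # stays at min_lr for t >= decay_steps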
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Length of the approximated SMA (RAdam).
        beta2_t = self.beta_2 ** t
        N_sma_max = 2 / (1 - self.beta_2) - 1
        N_sma = N_sma_max - 2 * t * beta2_t / (1 - beta2_t)

        # Apply weight decay.
        if self.weight_decay != 0.:
            p_wd = p - self.weight_decay * lr * p
        else:
            p_wd = None

        if p_wd is None:
            p_ = p
        else:
            p_ = p_wd

        def gt_path():
            # Variance-rectified adaptive step.
            step_size = lr * K.sqrt(
                (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) *
                (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)
            ) / (1 - self.beta_1 ** t)
            denom = K.sqrt(v_t) + self.epsilon
            p_t = p_ - step_size * (m_t / denom)
            return p_t

        def lt_path():
            # Bias-corrected momentum step while the variance estimate is
            # still untrustworthy.
            step_size = lr / (1 - self.beta_1 ** t)
            p_t = p_ - step_size * m_t
            return p_t

        p_t = K.switch(N_sma > 5, gt_path, lt_path)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def fit(self, V, verbose=1):
    """Train RBM with the data V.

    Parameters
    ----------
    V : 2d numpy array
        Visible data (batch size x input_dim).
    verbose : integer
        Verbose mode (default, 1).
    """
    # Number of steps per epoch, rounding up for a partial last batch.
    num_step = V.shape[0] // self.hps['batch_size'] \
        if V.shape[0] % self.hps['batch_size'] == 0 \
        else V.shape[0] // self.hps['batch_size'] + 1  # Exception processing?

    for k in range(self.hps['epochs']):
        if verbose == 1:
            print(k + 1, '/', self.hps['epochs'], ' epochs', end='\r')

        if self.mode == MODE_VISIBLE_BERNOULLI:
            # Contrastive divergence: positive phase from the data, one
            # Gibbs step for the negative phase.
            v_pos = self.input_visible
            h_pos = self.transform
            v_neg = K.cast(
                K.less(
                    K.random_uniform(shape=(self.hps['batch_size'],
                                            V.shape[1])),
                    K.sigmoid(K.dot(h_pos, K.transpose(self.rbm_weight))
                              + self.visible_bias)),
                dtype=np.float32)
            h_neg = K.sigmoid(K.dot(v_neg, self.rbm_weight)
                              + self.hidden_bias)
            # Weight update: v_pos'h_pos - v_neg'h_neg.
            update = K.transpose(
                K.transpose(K.dot(K.transpose(v_pos), h_pos))
                - K.dot(K.transpose(h_neg), v_neg))
            self.rbm_weight_update_func = K.function(
                [self.input_visible],
                [K.update_add(self.rbm_weight, self.hps['lr'] * update)])
            self.hidden_bias_update_func = K.function(
                [self.input_visible],
                [K.update_add(
                    self.hidden_bias,
                    self.hps['lr'] * (K.sum(h_pos, axis=0)
                                      - K.sum(h_neg, axis=0)))])
            self.visible_bias_update_func = K.function(
                [self.input_visible],
                [K.update_add(
                    self.visible_bias,
                    self.hps['lr'] * (K.sum(v_pos, axis=0)
                                      - K.sum(v_neg, axis=0)))])

            # Create the first visible nodes sampling object.
            self.sample_first_visible = K.function(
                [self.input_visible], [v_neg])
        elif self.mode == MODE_VISIBLE_GAUSSIAN:
            # Contrastive divergence with Gaussian visible units.
            v_pos = self.input_visible
            h_pos = self.transform
            v_neg = Ke.multivariate_normal_diag(
                loc=(K.dot(h_pos, K.transpose(self.rbm_weight))
                     + self.visible_bias),
                scale_diag=np.ones(shape=(self.hps['batch_size'],
                                          V.shape[1]))).sample()
            h_neg = K.sigmoid(K.dot(v_neg, self.rbm_weight)
                              + self.hidden_bias)
            update = K.transpose(
                K.transpose(K.dot(K.transpose(v_pos), h_pos))
                - K.dot(K.transpose(h_neg), v_neg))
            self.rbm_weight_update_func = K.function(
                [self.input_visible],
                [K.update_add(self.rbm_weight, self.hps['lr'] * update)])
            self.hidden_bias_update_func = K.function(
                [self.input_visible],
                [K.update_add(
                    self.hidden_bias,
                    self.hps['lr'] * (K.sum(h_pos, axis=0)
                                      - K.sum(h_neg, axis=0)))])
            self.visible_bias_update_func = K.function(
                [self.input_visible],
                [K.update_add(
                    self.visible_bias,
                    self.hps['lr'] * (K.sum(v_pos, axis=0)
                                      - K.sum(v_neg, axis=0)))])

            # Create the first visible nodes sampling object.
            self.sample_first_visible = K.function(
                [self.input_visible], [v_neg])
        else:
            pass

        for i in range(num_step):
            if i == (num_step - 1):
                # Last step: rebuild the update graph for the (possibly
                # smaller) remainder batch.
                if self.mode == MODE_VISIBLE_BERNOULLI:
                    # Contrastive divergence.
                    v_pos = self.input_visible
                    h_pos = self.transform
                    v_neg = K.cast(
                        K.less(
                            K.random_uniform(shape=(
                                V.shape[0] - int(i * self.hps['batch_size']),
                                V.shape[1])),
                            K.sigmoid(
                                K.dot(h_pos, K.transpose(self.rbm_weight))
                                + self.visible_bias)),
                        dtype=np.float32)
                    h_neg = K.sigmoid(K.dot(v_neg, self.rbm_weight)
                                      + self.hidden_bias)
                    update = K.transpose(
                        K.transpose(K.dot(K.transpose(v_pos), h_pos))
                        - K.dot(K.transpose(h_neg), v_neg))
                    self.rbm_weight_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(self.rbm_weight,
                                      self.hps['lr'] * update)])
                    self.hidden_bias_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(
                            self.hidden_bias,
                            self.hps['lr'] * (K.sum(h_pos, axis=0)
                                              - K.sum(h_neg, axis=0)))])
                    self.visible_bias_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(
                            self.visible_bias,
                            self.hps['lr'] * (K.sum(v_pos, axis=0)
                                              - K.sum(v_neg, axis=0)))])

                    # Create the first visible nodes sampling object.
                    self.sample_first_visible = K.function(
                        [self.input_visible], [v_neg])
                elif self.mode == MODE_VISIBLE_GAUSSIAN:
                    # Contrastive divergence.
                    v_pos = self.input_visible
                    h_pos = self.transform
                    v_neg = Ke.multivariate_normal_diag(
                        loc=(K.dot(h_pos, K.transpose(self.rbm_weight))
                             + self.visible_bias),
                        scale_diag=np.ones(shape=(
                            V.shape[0] - int(i * self.hps['batch_size']),
                            V.shape[1]))).sample()
                    h_neg = K.sigmoid(K.dot(v_neg, self.rbm_weight)
                                      + self.hidden_bias)
                    update = K.transpose(
                        K.transpose(K.dot(K.transpose(v_pos), h_pos))
                        - K.dot(K.transpose(h_neg), v_neg))
                    self.rbm_weight_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(self.rbm_weight,
                                      self.hps['lr'] * update)])
                    self.hidden_bias_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(
                            self.hidden_bias,
                            self.hps['lr'] * (K.sum(h_pos, axis=0)
                                              - K.sum(h_neg, axis=0)))])
                    self.visible_bias_update_func = K.function(
                        [self.input_visible],
                        [K.update_add(
                            self.visible_bias,
                            self.hps['lr'] * (K.sum(v_pos, axis=0)
                                              - K.sum(v_neg, axis=0)))])

                    # Create the first visible nodes sampling object.
                    self.sample_first_visible = K.function(
                        [self.input_visible], [v_neg])
                else:
                    pass

                V_batch = [V[int(i * self.hps['batch_size']):V.shape[0]]]

                # Train.
                self.rbm_weight_update_func(V_batch)
                self.hidden_bias_update_func(V_batch)
                self.visible_bias_update_func(V_batch)
            else:
                V_batch = [V[int(i * self.hps['batch_size']):
                             int((i + 1) * self.hps['batch_size'])]]

                # Train.
                self.rbm_weight_update_func(V_batch)
                self.hidden_bias_update_func(V_batch)
                self.visible_bias_update_func(V_batch)

            # Calculate a training score by each step.
            # Free energy of the input visible nodes.
            fe = self.cal_free_energy(V_batch)

            # Free energy of the first sampled visible nodes.
            V_p_batch = self.sample_first_visible(V_batch)
            fe_p = self.cal_free_energy(V_p_batch)

            score = np.mean(np.abs(fe[0] - fe_p[0]))  # Scale?
            print('\n{0:d}/{1:d}, score: {2:f}'.format(
                i + 1, num_step, score))
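# A compact NumPy sketch of the CD-1 update implemented above for Bernoulli
# visibles (illustrative only): one positive phase, one Gibbs step for the
# negative phase, then the same three parameter deltas
#   W   += lr * (v_pos' h_pos - v_neg' h_neg)
#   b_h += lr * sum(h_pos - h_neg)
#   b_v += lr * sum(v_pos - v_neg)
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
n_v, n_h, lr = 6, 4, 0.01
W = rng.normal(scale=0.1, size=(n_v, n_h))
b_v, b_h = np.zeros(n_v), np.zeros(n_h)

v_pos = rng.integers(0, 2, size=(8, n_v)).astype(float)   # a data batch
h_pos = sigmoid(v_pos @ W + b_h)                          # positive phase
v_neg = (rng.random((8, n_v))
         < sigmoid(h_pos @ W.T + b_v)).astype(float)      # one Gibbs step
h_neg = sigmoid(v_neg @ W + b_h)

W += lr * (v_pos.T @ h_pos - v_neg.T @ h_neg)
b_h += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0))
b_v += lr * (v_pos.sum(axis=0) - v_neg.sum(axis=0))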