def _kl_gumbel_gumbel(dist1, dist2):
    scale_1d2 = dist1.scale / dist2.scale
    return exponential.log(dist2.scale) - exponential.log(dist1.scale) \
        + EULER * (scale_1d2 - 1.) \
        + exponential.exp((dist2.loc - dist1.loc) / dist2.scale
                          + lgamma.lgamma(scale_1d2 + 1.)) \
        - 1 + (dist1.loc - dist2.loc) / dist2.scale
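# For reference, with loc = mu, scale = beta, and EULER assumed to be the
# Euler-Mascheroni constant, the expression above is the closed form
#
#   KL = log(beta2 / beta1) + gamma * (beta1 / beta2 - 1)
#        + exp((mu2 - mu1) / beta2 + lgamma(beta1 / beta2 + 1))
#        - 1 + (mu1 - mu2) / beta2,
#
# i.e. the KL divergence between two Gumbel distributions.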
def log_prob(self, x):
    logp = - lgamma.lgamma(self.k) - self.k * exponential.log(self.theta) \
        + (self.k - 1) * exponential.log(x) - x / self.theta
    xp = logp.xp
    inf = xp.full_like(logp.array, xp.inf)
    if isinstance(x, chainer.Variable):
        x = x.array
    return where.where(xp.asarray(x >= 0), logp, xp.asarray(-inf))
def log_prob(self, x):
    x = chainer.as_variable(x)
    logp = exponential.log(self.alpha) \
        + self.alpha * exponential.log(self.scale) \
        - (self.alpha + 1) * exponential.log(x)
    xp = logp.xp
    return where.where(
        utils.force_array(x.data >= self.scale.data),
        logp, xp.array(-xp.inf, logp.dtype))
def _kl_pareto_pareto(dist1, dist2):
    kl = dist2.alpha * (exponential.log(dist1.scale)
                        - exponential.log(dist2.scale)) \
        + exponential.log(dist1.alpha) - exponential.log(dist2.alpha) \
        + (dist2.alpha - dist1.alpha) / dist1.alpha
    xp = kl.xp
    return where.where(
        dist1.scale.data >= dist2.scale.data,
        kl, xp.array(xp.inf, kl.dtype))
def log_prob(self, x):
    logp = (self.a - 1) * exponential.log(x) \
        + (self.b - 1) * exponential.log(1 - x) \
        - _lbeta(self.a, self.b)
    xp = logp.xp
    inf = xp.full_like(logp.array, xp.inf)
    if isinstance(x, chainer.Variable):
        x = x.array
    return where.where(xp.logical_and(x >= 0, x <= 1), logp, -inf)
def log_prob(self, x):
    x = chainer.as_variable(x)
    logp = (self.a - 1) * exponential.log(x) \
        + (self.b - 1) * exponential.log(1 - x) \
        - _lbeta(self.a, self.b)
    xp = logp.xp
    return where.where(
        utils.force_array((x.array >= 0) & (x.array <= 1)),
        logp, xp.array(-xp.inf, logp.dtype))
def _kl_uniform_uniform(dist1, dist2):
    xp = backend.get_array_module(dist1.low)
    is_inf = xp.logical_or(dist1.high.data > dist2.high.data,
                           dist1.low.data < dist2.low.data)
    kl = - exponential.log(dist1.high - dist1.low) \
        + exponential.log(dist2.high - dist2.low)
    inf = xp.array(xp.inf, dist1.high.dtype)
    return where.where(is_inf, inf, kl)
def _kl_uniform_uniform(dist1, dist2):
    xp = cuda.get_array_module(dist1.low)
    is_inf = xp.logical_or(dist1.high.data > dist2.high.data,
                           dist1.low.data < dist2.low.data)
    kl = - exponential.log(dist1.high - dist1.low) \
        + exponential.log(dist2.high - dist2.low)
    inf = xp.full_like(dist1.high.data, numpy.inf)
    return where.where(is_inf, inf, kl)
def log_prob(self, x):
    logp = exponential.log(self.lam) - self.lam * x
    xp = logp.xp
    if isinstance(x, chainer.Variable):
        x = x.array
    inf = xp.full_like(logp.array, xp.inf)
    return where.where(xp.asarray(x >= 0), logp, xp.asarray(-inf))
def log_prob(self, x):
    if isinstance(x, chainer.Variable):
        x = x.data
    x = x.astype(self.lam.dtype)
    xp1 = (x + 1).astype(self.lam.dtype)
    x, xp1 = utils.force_array(x), utils.force_array(xp1)
    return x * exponential.log(self.lam) - lgamma.lgamma(xp1) - self.lam
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)
    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2,
        dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)

    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
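# A hedged reading of the snippet above: with Sigma_i = L_i L_i^T (L_i = scale_tril)
# and d the event dimension, it evaluates the usual closed form
#
#   KL = log|L_2| - log|L_1|
#        + 1/2 * ||L_2^{-1} L_1||_F^2
#        + 1/2 * ||L_2^{-1} (mu_1 - mu_2)||^2
#        - d / 2,
#
# written directly in terms of the Cholesky factors of the two covariances.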
def log_prob(self, x):
    if not isinstance(x, chainer.Variable):
        x = chainer.Variable(x)
    xp = backend.get_array_module(x)
    logp = broadcast.broadcast_to(
        -exponential.log(self.scale), x.shape)
    return where.where(
        utils.force_array(
            (x.data >= self.low.data) & (x.data <= self.high.data)),
        logp, xp.array(-xp.inf, logp.dtype))
def log_prob(self, x):
    if not isinstance(x, chainer.Variable):
        x = chainer.Variable(x)
    xp = cuda.get_array_module(x)
    logp = broadcast.broadcast_to(
        -exponential.log(self.scale), x.shape)
    return where.where(
        utils.force_array(
            (x.data >= self.low.data) & (x.data < self.high.data)),
        logp, xp.full_like(logp.array, -numpy.inf))
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
def __init__(self, p=None, logit=None):
    super(Bernoulli, self).__init__()
    if not (p is None) ^ (logit is None):
        raise ValueError(
            "Either `p` or `logit` (not both) must have a value.")
    with chainer.using_config('enable_backprop', True):
        if p is None:
            self.logit = chainer.as_variable(logit)
            self.p = sigmoid.sigmoid(self.logit)
        else:
            self.p = chainer.as_variable(p)
            self.logit = exponential.log(self.p) \
                - logarithm_1p.log1p(-self.p)
def __init__(self, p=None, **kwargs):
    logit = None
    if kwargs:
        logit, = argument.parse_kwargs(
            kwargs, ('logit', logit))
    if not (p is None) ^ (logit is None):
        raise ValueError(
            "Either `p` or `logit` (not both) must have a value.")
    with chainer.using_config('enable_backprop', True):
        if p is None:
            logit = chainer.as_variable(logit)
            self.__log_p = log_softmax.log_softmax(logit, axis=-1)
            self.__p = exponential.exp(self.__log_p)
        else:
            self.__p = chainer.as_variable(p)
            self.__log_p = exponential.log(self.__p)
def __init__(self, loc, scale=None, **kwargs):
    super(Normal, self).__init__()
    log_scale = None
    if kwargs:
        log_scale, = argument.parse_kwargs(
            kwargs, ('log_scale', log_scale))
    if not (scale is None) ^ (log_scale is None):
        raise ValueError(
            "Either `scale` or `log_scale` (not both) must have a value.")
    self.loc = chainer.as_variable(loc)
    with chainer.using_config('enable_backprop', True):
        if scale is None:
            self.__log_scale = chainer.as_variable(log_scale)
            self.__scale = exponential.exp(self.log_scale)
        else:
            self.__scale = chainer.as_variable(scale)
            self.__log_scale = exponential.log(self.scale)
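# A minimal usage sketch for the constructor above, assuming it is
# chainer.distributions.Normal (exactly one of `scale` / `log_scale` may be
# given; passing both, or neither, raises ValueError).
import numpy as np
import chainer.distributions as D

loc = np.zeros(3, dtype=np.float32)
d1 = D.Normal(loc, scale=np.ones(3, dtype=np.float32))       # scale parameterization
d2 = D.Normal(loc, log_scale=np.zeros(3, dtype=np.float32))  # same distribution, log-scale parameterization
# D.Normal(loc, scale=..., log_scale=...)  # would raise ValueError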
def log_p(self):
    if self.__p is not None:
        return exponential.log(self.__p)
    else:
        return log_softmax.log_softmax(self.__logit, axis=-1)
def _log_lam(self):
    return exponential.log(self.lam)
def _kl_geometric_geometric(dist1, dist2):
    return (1 / dist1.p - 1) \
        * (exponential.log(1 - dist1.p) - exponential.log(1 - dist2.p)) \
        + exponential.log(dist1.p) - exponential.log(dist2.p)
def log_prob(self, x):
    return (x - 1) * exponential.log(1 - self.p) + exponential.log(self.p)
def log_cdf(self, x):
    return exponential.log(self.cdf(x))
def log_scale(self):
    if self.__log_scale is not None:
        return chainer.as_variable(self.__log_scale)
    else:
        return exponential.log(self.scale)
def logit(self):
    if self.__logit is not None:
        return chainer.as_variable(self.__logit)
    else:
        return exponential.log(self.p) - logarithm_1p.log1p(-self.p)
def log_p(self):
    if self.__p is not None:
        return exponential.log(self.__p)
    else:
        return log_softmax.log_softmax(self.__logit, axis=-1)
def log_prob(self, x):
    logx = exponential.log(x)
    return LOGPROBC - self._log_sigma - logx \
        - (0.5 * (logx - self.mu) ** 2 / self.sigma ** 2)
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'mean'``, this function takes a mean of
    loss values.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Batch of input vectors. Its shape should be :math:`(N, D)`.
        t (:class:`~chainer.Variable` or :ref:`ndarray`):
            Vector of ground truth labels. Its shape should be :math:`(N,)`.
            Each element :math:`v` should satisfy :math:`0 \\leq v < V`
            or be :math:`-1`, where :math:`V` is the number of label types.
        W (:class:`~chainer.Variable` or :ref:`ndarray`):
            Weight matrix. Its shape should be :math:`(V, D)`.
        samples (~chainer.Variable): Negative samples. Its shape should be
            :math:`(N, S)` where :math:`S` is the number of negative samples.
        reduce (str): Reduction option. Its value must be either
            ``'no'`` or ``'mean'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an
            array whose shape is :math:`(N,)` .
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
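# A small usage sketch, assuming the function above is the one exposed as
# chainer.functions.black_out; shapes follow the docstring, values are random.
import numpy as np
import chainer.functions as F

N, D, V, S = 4, 8, 100, 5                                        # batch, dims, vocab, negatives
x = np.random.randn(N, D).astype(np.float32)                     # input vectors, (N, D)
t = np.random.randint(0, V, size=N).astype(np.int32)             # ground-truth labels, (N,)
samples = np.random.randint(0, V, size=(N, S)).astype(np.int32)  # negative samples, (N, S)
W = np.random.randn(V, D).astype(np.float32)                     # weight matrix, (V, D)

mean_loss = F.black_out(x, t, W, samples)                # reduce='mean' -> scalar Variable
per_sample = F.black_out(x, t, W, samples, reduce='no')  # shape (N,)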
def _log_sigma(self):
    return exponential.log(self.sigma)
def log_prob(self, x):
    return (-numpy.log(numpy.pi) + exponential.log(self.scale)
            - exponential.log((x - self.loc)**2 + self.scale**2))
def entropy(self):
    return exponential.log(4 * numpy.pi * self.scale)
def entropy(self):
    return self.k + exponential.log(self.theta) + lgamma.lgamma(self.k) \
        + (1 - self.k) * digamma.digamma(self.k)
def log_survival_function(self, x):
    return exponential.log(self.survival_function(x))
def entropy(self):
    return 0.5 - LOGPROBC + exponential.log(self.sigma) + self.mu
def log_p(self):
    return exponential.log(self.p)
def _kl_bernoulli_bernoulli(dist1, dist2):
    return (dist1.logit - dist2.logit) * (dist1.p - 1.) \
        - exponential.log(exponential.exp(-dist1.logit) + 1) \
        + exponential.log(exponential.exp(-dist2.logit) + 1)
def _kl_log_normal_log_normal(dist1, dist2):
    return 0.5 * ((dist1.mu - dist2.mu) ** 2
                  + dist1.sigma ** 2) / dist2.sigma ** 2 - 0.5 \
        + exponential.log(dist2.sigma) - exponential.log(dist1.sigma)
def log_prob(self, x):
    return - exponential.log(broadcast.broadcast_to(self.scale, x.shape)) \
        - 0.5 * (x - broadcast.broadcast_to(self.loc, x.shape)) ** 2 \
        / broadcast.broadcast_to(self.scale, x.shape) ** 2 + LOGPROBC
def _log_alpha(self):
    return exponential.log(self.alpha)
def _kl_normal_normal(dist1, dist2):
    return exponential.log(dist2.scale) - exponential.log(dist1.scale) \
        + 0.5 * (dist1.scale ** 2 + (dist1.loc - dist2.loc) ** 2) \
        / dist2.scale ** 2 - 0.5
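# A standalone NumPy restatement of the closed form the snippet above computes,
# KL(N(mu1, s1^2) || N(mu2, s2^2)) = log(s2/s1) + (s1^2 + (mu1-mu2)^2) / (2*s2^2) - 1/2:
import numpy as np

def kl_normal_normal_ref(mu1, s1, mu2, s2):
    return np.log(s2 / s1) + 0.5 * (s1 ** 2 + (mu1 - mu2) ** 2) / s2 ** 2 - 0.5

print(kl_normal_normal_ref(0.0, 1.0, 0.0, 1.0))  # 0.0 for identical distributions
print(kl_normal_normal_ref(0.0, 1.0, 1.0, 2.0))  # > 0 otherwise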
def _kl_laplace_laplace(dist1, dist2):
    diff = abs(dist1.loc - dist2.loc)
    return exponential.log(dist2.scale) - exponential.log(dist1.scale) \
        + diff / dist2.scale \
        + dist1.scale / dist2.scale * exponential.exp(- diff / dist1.scale) \
        - 1
def infer_initial_states_sctrnn(params, old_model, testing_data, num_timesteps=0,
                                epochs=None, start_is='mean',
                                error_computation='standard',
                                single_recognition=False, hyp_prior=None,
                                external_signal_variance=-1, x_start=None,
                                use_init_state_loss=True):
    # each trajectory is handled as a separate "class", infer initial states per class
    num_classes = testing_data.shape[0]
    # full number of timesteps
    num_timesteps_orig = int(testing_data.shape[1] / params.num_io)
    # timesteps to use for inference
    if num_timesteps == 0:
        num_timesteps = num_timesteps_orig

    gpu_id = 0  # -1 for CPU
    # Determine whether CPU or GPU should be used
    xp = np
    if gpu_id >= 0 and cuda.available:
        print("Use GPU!")
        cuda.get_device_from_id(gpu_id).use()
        xp = cuda.cupy
    else:
        print("Use CPU!")
        gpu_id = -1

    c = []
    num_samples_per_class = 1
    for i in range(num_classes):
        for j in range(num_samples_per_class):
            c.append(i)
    c_train = xp.array(c)

    save_location = "."
    if os.path.exists("/media/AnjaDataDrive"):
        save_location = "/media/AnjaDataDrive"
    save_location += "/results"

    now = datetime.datetime.now()
    expStr = str(now.year).zfill(4) + "-" + str(now.month).zfill(2) + "-" + \
        str(now.day).zfill(2) + "_" + str(now.hour).zfill(2) + "-" + \
        str(now.minute).zfill(2) + "_" + str(now.microsecond).zfill(7) + \
        "_inference"
    save_dir = os.path.join(save_location, expStr)
    print(save_dir)
    pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True)

    save_interval = 100        # interval for testing the production capability of the network and saving initial state information
    save_model_interval = 100  # interval for storing the learned model

    # Should better already be done outside this method
    # try:
    #     x_train = range2norm(x_train_orig, params.norm_offset, params.norm_range, minmax = params.minmax)
    #     x_train = xp.float32(x_train)
    #     # N = len(x_train)
    # except:
    #     print("No normalization applicable...")
    #     x_train = testing_data

    # CUT PART OF THE TRAINING SIGNAL (COMPLETION TASK)
    testing_data_cut = testing_data[:, 0:params.num_io * num_timesteps]

    plot_results(xp.copy(testing_data_cut[0::num_samples_per_class]),
                 num_timesteps,
                 os.path.join(save_dir, 'target_trajectories.png'),
                 params.num_io, twoDim=True)

    info = "same trajectories (original #timesteps: " + str(num_timesteps_orig) \
        + "), used #timesteps: " + str(num_timesteps)

    # copy network model and prepare it for backpropagation inference
    params.learn_weights = False
    params.learn_bias = False
    params.epochs = epochs
    max_epochs = 500
    if params.epochs:
        epoch_array_size = params.epochs
    else:
        epoch_array_size = max_epochs

    model = SCTRNN(params.num_io, params.num_c, params.tau_c, num_classes,
                   init_state_init=params.init_state_init,
                   init_state_learning=params.learn_init_states,
                   weights_learning=params.learn_weights,
                   bias_learning=params.learn_bias,
                   tau_learning=params.learn_tau,
                   pretrained_model=old_model)
    # model.hyp_prior = params.hyp_prior
    # model.external_signal_variance = params.external_signal_variance
    if hyp_prior is not None:
        model.hyp_prior = hyp_prior
        params.hyp_prior = hyp_prior
    if external_signal_variance is None or external_signal_variance >= 0:
        model.external_signal_variance = external_signal_variance
        params.external_signal_variance = external_signal_variance
    params.lr = 0.01

    with open(os.path.join(save_dir, "info.txt"), 'w') as f:
        f.write(params.get_parameter_string())
        f.write("\n")
        f.write(info)
        f.write("\n")
        f.close()

    if start_is == 'mean':
        model.set_initial_states_mean()
    elif start_is == 'zero':
        model.set_initial_states_zero()
    else:
        model.initial_states.W.array = start_is
    # model.apply_estimated_variance = True
    model.set_init_state_learning(c_train)

    if gpu_id >= 0:
        model.to_gpu(gpu_id)
        testing_data = cuda.to_gpu(testing_data)
        x_start = cuda.to_gpu(x_start)

    save_network(save_dir, params=params, model=model,
                 model_filename="network-initial")

    # Optimizer
    optimizer = optimizers.Adam(params.lr)
    optimizer.setup(model)
    # optimizer.add_hook(chainer.optimizer.WeightDecay(0))

    history_init_state_var = np.zeros((epoch_array_size + 1,))
    history_init_state_var[0] = np.mean(
        np.var(model.initial_states.W.array, axis=0))
    history_generation_error_proactive = np.empty((num_classes,), dtype=object)
    history_generation_error_reactive = np.empty((num_classes,), dtype=object)
    history_training_error = np.zeros((epoch_array_size + 1,))
    history_training_variance_estimation = np.zeros(
        (epoch_array_size + 1, num_classes))
    history_initial_states = []
    likelihood_per_epoch = []

    print("actual variance of init_states_0: " + str(history_init_state_var[0]))

    # Evaluate the performance of the untrained network
    test_batch_size = np.min(
        [model.initial_states.W.array.shape[0], testing_data.shape[0]])

    res, resv, resm = model.generate(model.initial_states.W.array,
                                     num_timesteps_orig,
                                     add_variance_to_output=0,
                                     x_start=x_start)
    results = res  # cuda.to_cpu(res)
    for i in range(num_classes):
        generation_error = chainer.functions.mean_squared_error(
            results[i, :], testing_data[i, :]).array.tolist()
        history_generation_error_proactive[i] = [generation_error]
        with open(os.path.join(save_dir, "evaluation.txt"), 'a') as f:
            f.write("before learning: pattern generation error (proactive): "
                    + str(history_generation_error_proactive[i]) + "\n")
    plot_results(xp.copy(results), num_timesteps_orig,
                 os.path.join(save_dir, "proactive_before-learning"),
                 params.num_io, twoDim=True)

    res, resv, resm, pe, wpe, respost = model.generate(
        model.initial_states.W.array, num_timesteps_orig,
        external_input=xp.asarray(testing_data[0::num_samples_per_class, :]),
        add_variance_to_output=0, x_start=x_start)
    results = res  # cuda.to_cpu(res)
    for i in range(num_classes):
        generation_error = chainer.functions.mean_squared_error(
            results[i, :], testing_data[i, :]).array.tolist()
        history_generation_error_reactive[i] = [generation_error]
        with open(os.path.join(save_dir, "evaluation.txt"), 'a') as f:
            f.write("before learning: pattern generation error (reactive): "
                    + str(history_generation_error_reactive[i]) + "\n")
    plot_results(xp.copy(results), num_timesteps_orig,
                 os.path.join(save_dir, "reactive_before-learning"),
                 params.num_io, twoDim=True)

    # arrays for tracking likelihood and determining stop condition
    all_mean_diffs = []
    all_std_diffs = []
    m1s = []
    s1s = []
    # tmp_epoch_marker = 0
    # conv_eval_interval = 1000  # the length of the interval to consider for determining convergence

    for epoch in range(1, epoch_array_size + 1):
        epochStart = time.time()
        outv = np.zeros((num_timesteps,))

        # permutate samples in each epoch so that they are randomly ordered
        perm = np.random.permutation(testing_data_cut.shape[0])
        # here, one batch equals the full training set
        x_batch = xp.asarray(testing_data_cut[perm])
        x_batch = x_batch + 0.01 * xp.random.randn(
            x_batch.shape[0], x_batch.shape[1]).astype('float32')

        model.set_init_state_learning(c_train[perm])

        mean_init_states = chainer.Variable(xp.zeros((), dtype=xp.float32))
        mean_init_states = chainer.functions.average(
            model.initial_states.W, axis=0)  # keepdims=True
        # mean_init_states = xp.mean(c0.array, axis=0)  # using this instead causes no difference in resulting gradient of c0

        # initialize error
        acc_loss = chainer.Variable(xp.zeros((), dtype=xp.float32))  # for weight backprop
        acc_init_loss = chainer.Variable(xp.zeros((), dtype=xp.float32))  # for init states backprop
        err = xp.zeros(())  # for evaluation only

        # clear gradients from previous batch
        model.cleargrads()
        # clear output and variance estimations from previous batch
        model.reset_current_output()

        t = 0  # iterate through time
        x_t = x_batch[:, params.num_io * t:params.num_io * (t + 1)]
        # next time step to be predicted (for evaluation)
        x_t1 = x_batch[:, params.num_io * (t + 1):params.num_io * (t + 2)]

        # x_t = xp.reshape(x_batch[0][t,:], (1, params.num_io))
        # x_t1 = xp.reshape(x_batch[0][t+1,:], (1, params.num_io))
        # for i in range(1, params.batch_size):
        #     x_t = np.concatenate((x_t, xp.reshape(x_batch[i][t,:], (1,params.num_io))), axis=0)
        #     x_t1 = np.concatenate((x_t1, xp.reshape(x_batch[i][t+1,:], (1,params.num_io))), axis=0)

        # execute first forward step
        u_h, y, v = model(x_t, None)  # initial states of u_h are set automatically according to model.classes

        # noisy output estimation
        # y_out = y.array + xp.sqrt(v.array) * xp.random.randn()

        # compute prediction error, averaged over batch
        if error_computation == 'standard':
            # compare network prediction to ground truth
            loss_i = chainer.functions.gaussian_nll(
                chainer.Variable(x_t1), y, exponential.log(v))
        elif error_computation == 'integrated':
            # compare network prediction to posterior of perception
            loss_i = chainer.functions.gaussian_nll(
                model.current_x, y, exponential.log(v))
        acc_loss += loss_i

        # compute error for evaluation purposes
        err += chainer.functions.mean_squared_error(
            chainer.Variable(x_t), y).array.reshape(()) * params.batch_size
        outv[t] = xp.mean(v.array)

        # rollout trajectory
        for t in range(1, num_timesteps - 1):
            # current time step
            x_t = x_batch[:, params.num_io * t:params.num_io * (t + 1)]
            # next time step to be predicted (for evaluation)
            x_t1 = x_batch[:, params.num_io * (t + 1):params.num_io * (t + 2)]

            u_h, y, v = model(x_t, u_h)

            # noisy output estimation
            # y_out = y.array + xp.sqrt(v.array) * xp.random.randn()

            # compute error for backprop for weights
            if error_computation == 'standard':
                loss_i = chainer.functions.gaussian_nll(
                    chainer.Variable(x_t1), y, exponential.log(v))
            elif error_computation == 'integrated':
                integrated_x = params.training_external_contrib \
                    * chainer.Variable(x_t1) \
                    + (1 - params.training_external_contrib) \
                    * (y + chainer.functions.sqrt(v) * xp.random.randn())
                loss_i = chainer.functions.gaussian_nll(
                    integrated_x, y, exponential.log(v))
            acc_loss += loss_i

            # compute error for evaluation purposes
            err += chainer.functions.mean_squared_error(
                chainer.Variable(x_t), y).array.reshape(()) * params.batch_size
            outv[t] = xp.mean(v.array)

        # for each training sequence of this batch: compute loss for maintaining desired initial state variance
        if not single_recognition and use_init_state_loss:
            for s in range(len(c_train)):
                if gpu_id >= 0:
                    acc_init_loss += chainer.functions.gaussian_nll(
                        model.initial_states()[model.classes][s],
                        mean_init_states,
                        xp.ones(mean_init_states.shape) * exponential.log(
                            cuda.to_gpu(params.init_state_var, device=gpu_id)))
                else:
                    acc_init_loss += chainer.functions.gaussian_nll(
                        model.initial_states()[model.classes][s],
                        mean_init_states,
                        exponential.log(params.init_state_var))

            # compute gradients
            # (gradients from L_out and L_init are summed up)
            # gradient of initial states equals:
            # 1/params.init_state_var * (c0[cl]-mean_init_states).array
            acc_init_loss.backward()
        else:
            epochBatchProcessed = time.time()

        acc_loss.backward()
        print("update")
        optimizer.update()

        print("Done epoch " + str(epoch))
        error = err / params.batch_size / num_timesteps
        mean_estimated_var = xp.mean(outv)
        history_training_error[epoch] = error
        history_training_variance_estimation[epoch, :] = mean_estimated_var

        print("train MSE = " + str(error) + "\nmean estimated var: "
              + str(mean_estimated_var))
        print("init_states = [" + str(model.initial_states.W.array[0][0]) + ","
              + str(model.initial_states.W.array[0][1]) + "...], var: "
              + str(np.mean(np.var(model.initial_states.W.array, axis=0)))
              + ", accs: " + str(acc_loss) + " + " + str(acc_init_loss))

        likelihood_per_epoch.append(
            np.float64(acc_loss.array + acc_init_loss.array))

        history_init_state_var[epoch] = np.mean(
            np.var(model.initial_states.W.array, axis=0))

        with open(os.path.join(save_dir, "evaluation.txt"), 'a') as f:
            f.write("epoch: " + str(epoch) + "\n")
            f.write("train MSE = " + str(error) + "\nmean estimated var: "
                    + str(mean_estimated_var))
            f.write("initial state var: " + str(history_init_state_var[epoch])
                    + ", precision loss: " + str(acc_loss)
                    + ", variance loss: " + str(acc_init_loss)
                    + "\ninit states:\n")
            for i in range(num_classes):
                f.write("\t[" + str(model.initial_states.W[i][0]) + ","
                        + str(model.initial_states.W[i][1]) + "...]\n")
            f.close()

        if epoch % save_interval == 1 or epoch == params.epochs:
            # evaluate proactive generation
            res, resv, resm, u_h_history = model.generate(
                model.initial_states.W.array, num_timesteps_orig,
                add_variance_to_output=0, additional_output='activations',
                x_start=x_start)
            results = res  # cuda.to_cpu(res)
            plot_results(xp.copy(results), num_timesteps_orig,
                         os.path.join(
                             save_dir,
                             "proactive_epoch-" + str(epoch).zfill(len(str(epochs)))),
                         params.num_io, twoDim=True)
            for i in range(num_classes):
                generation_error = chainer.functions.mean_squared_error(
                    results[i, :], testing_data[i, :]).array.tolist()
                history_generation_error_proactive[i].append(generation_error)
                with open(os.path.join(save_dir, "evaluation.txt"), 'a') as f:
                    f.write("pattern generation error (proactive): "
                            + str(generation_error) + "\n")
                    f.close()

            # evaluate reactive generation
            res, resv, resm, pe, wpe, u_h_history, respost = model.generate(
                model.initial_states.W.array, num_timesteps_orig,
                external_input=xp.asarray(
                    testing_data[0::num_samples_per_class, :]),
                additional_output='activations', x_start=x_start)
            results = res  # cuda.to_cpu(res)
            plot_results(xp.copy(results), num_timesteps_orig,
                         os.path.join(
                             save_dir,
                             "reactive_epoch-" + str(epoch).zfill(len(str(epochs)))),
                         params.num_io, twoDim=True)
            for i in range(test_batch_size):
                generation_error = chainer.functions.mean_squared_error(
                    results[i, :], testing_data[i, :]).array.tolist()
                history_generation_error_reactive[i].append(generation_error)
                with open(os.path.join(save_dir, "evaluation.txt"), 'a') as f:
                    f.write("pattern generation error (reactive): "
                            + str(generation_error) + "\n")
                    f.close()

        if epoch % save_model_interval == 1 or epoch == params.epochs:
            save_network(save_dir, params, model,
                         model_filename="network-epoch-"
                         + str(epoch).zfill(len(str(epochs))))

            np.save(os.path.join(save_dir, "history_init_state_var"),
                    np.array(history_init_state_var))
            np.save(os.path.join(save_dir, "history_generation_error_proactive"),
                    np.array(history_generation_error_proactive))
            np.save(os.path.join(save_dir, "history_generation_error_reactive"),
                    np.array(history_generation_error_reactive))
            np.save(os.path.join(save_dir, "history_training_error"),
                    np.array(history_training_error))
            np.save(os.path.join(save_dir, "history_training_variance_estimation"),
                    np.array(history_training_variance_estimation))

            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(np.arange(0, len(history_init_state_var)),
                    history_init_state_var)
            plt.title("init state variance")
            fig.savefig(os.path.join(save_dir, "init-state-var"))
            plt.close()

            fig = plt.figure()
            ax = fig.add_subplot(121)
            for i in range(num_classes):
                ax.plot(np.arange(0, len(history_generation_error_proactive[i]))
                        * save_interval,
                        history_generation_error_proactive[i])
            ax = fig.add_subplot(122)
            for i in range(num_classes):
                ax.plot(np.arange(0, len(history_generation_error_reactive[i]))
                        * save_interval,
                        history_generation_error_reactive[i], label=str(i))
            plt.title("generation error (proactive / reactive)")
            plt.legend()
            fig.savefig(os.path.join(save_dir, "generation-error"))
            plt.close()

            plt.figure()
            plt.plot(np.arange(len(all_std_diffs)), all_std_diffs, 'bo',
                     label='std diff')
            plt.plot(np.arange(len(all_mean_diffs)), all_mean_diffs, 'ro',
                     label='mean diff')
            plt.legend()
            plt.savefig(os.path.join(save_dir, 'convergence-condition.png'))
            plt.close()

        history_initial_states.append(model.initial_states.W.array.copy())

        # if no epoch number is decided, stop when error is below a threshold
        if not epochs:
            if error < 0.01:
                break

    save_network(save_dir, params, model, model_filename="network-final")

    return model.initial_states, history_initial_states, results, resm, save_dir
def log_prob(self, x):
    return - _lbeta(self.alpha) \
        + sum_mod.sum((self.alpha - 1) * exponential.log(x), axis=-1)
def entropy(self):
    return exponential.log(self.scale)
def _kl_bernoulli_bernoulli(dist1, dist2):
    return (dist1.logit - dist2.logit) * (dist1.p - 1.) \
        - exponential.log(exponential.exp(-dist1.logit) + 1) \
        + exponential.log(exponential.exp(-dist2.logit) + 1)
def _triangular_logdet(x):
    diag = diagonal.diagonal(x, axis1=-2, axis2=-1)
    return sum_mod.sum(exponential.log(abs(diag)), axis=-1)
def _kl_gamma_gamma(dist1, dist2):
    return (dist1.k - dist2.k) * digamma.digamma(dist1.k) \
        - (lgamma.lgamma(dist1.k) - lgamma.lgamma(dist2.k)) \
        + dist2.k \
        * (exponential.log(dist2.theta) - exponential.log(dist1.theta)) \
        + dist1.k * (dist1.theta / dist2.theta - 1)
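# A standalone restatement of the closed form above (shape k, scale theta),
# assuming SciPy is available for the digamma and log-gamma functions:
import numpy as np
from scipy.special import digamma, gammaln

def kl_gamma_gamma_ref(k1, t1, k2, t2):
    return ((k1 - k2) * digamma(k1)
            - (gammaln(k1) - gammaln(k2))
            + k2 * (np.log(t2) - np.log(t1))
            + k1 * (t1 / t2 - 1.0))

print(kl_gamma_gamma_ref(2.0, 1.0, 2.0, 1.0))  # 0.0 for identical parameters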
def _log_scale(self):
    return exponential.log(self.scale)
def log_prob(self, x):
    return (
        - lgamma.lgamma(self._half_k)
        - self._half_k * numpy.log(2.)
        + (self._half_k - 1) * exponential.log(x)
        - 0.5 * x)
def log_prob(self, x):
    logx = exponential.log(x)
    return LOGPROBC - exponential.log(self.sigma) - logx \
        - (0.5 * (logx - self.mu) ** 2 / self.sigma ** 2)
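# A sanity check of the same log-normal log-density against SciPy, assuming the
# LOGPROBC constant used above is -0.5 * log(2 * pi):
import numpy as np
from scipy import stats

mu, sigma, x = 0.3, 0.8, 2.0
LOGPROBC = -0.5 * np.log(2 * np.pi)
logp = LOGPROBC - np.log(sigma) - np.log(x) - 0.5 * (np.log(x) - mu) ** 2 / sigma ** 2
print(np.isclose(logp, stats.lognorm.logpdf(x, s=sigma, scale=np.exp(mu))))  # True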
def _log_alpha(self):
    return exponential.log(self.alpha)
def _log_scale(self):
    return exponential.log(self.scale)
def entropy(self):
    return exponential.log(self.scale)
def log_prob(self, x):
    scale = self.scale
    return - exponential.log(2 * scale) - abs(x - self.loc) / scale
# next time step to be predicted (for evaluation)
x_t1 = x_batch[:, p.num_io * (t + 1):p.num_io * (t + 2)]

# execute first forward step
u_h, y, v = model(xp.copy(x_t), None)  # initial states of u_h are set automatically according to model.classes

# noisy output estimation
# y_out = y.array + xp.sqrt(v.array) * xp.random.randn()

# compute prediction error, averaged over batch
if prediction_error_type == 'standard':
    # compare network prediction to ground truth
    loss_i = chainer.functions.gaussian_nll(
        chainer.Variable(x_t1), y, exponential.log(v))
elif prediction_error_type == 'integrated':
    # compare network prediction to posterior of perception
    loss_i = chainer.functions.gaussian_nll(
        model.current_x, y, exponential.log(v))
acc_loss += loss_i

# compute error for evaluation purposes
err += chainer.functions.mean_squared_error(
    chainer.Variable(x_t), y).array.reshape(()) * p.batch_size
estimated_variance[t] = xp.mean(v.array)

# rollout trajectory
for t in range(1, num_timesteps - 1):
    # current time step
def entropy(self):
    return 1. + exponential.log(2 * self.scale)
def _kl_laplace_laplace(dist1, dist2):
    diff = abs(dist1.loc - dist2.loc)
    return exponential.log(dist2.scale) - exponential.log(dist1.scale) \
        + diff / dist2.scale \
        + dist1.scale / dist2.scale * exponential.exp(- diff / dist1.scale) \
        - 1
def logit(self):
    if self.__logit is not None:
        return chainer.as_variable(self.__logit)
    else:
        return exponential.log(self.p) - logarithm_1p.log1p(-self.p)
def entropy(self):
    return 1. + exponential.log(2 * self.scale)
def log_prob(self, x):
    bl = broadcast.broadcast_to(self.loc, x.shape)
    bs = broadcast.broadcast_to(self.scale, x.shape)
    return - exponential.log(2 * bs) - abs(x - bl) / bs
def log_prob(self, x):
    return - _lbeta(self.alpha) \
        + sum_mod.sum((self.alpha - 1) * exponential.log(x), axis=-1)