def forward_backward(self, data, label, temperature=5.0):
    # Split the global batch into one slice per device context.
    data_slice = []
    label_slice = []
    for i in range(len(self._ctxs)):
        data_slice.append(data[i * self._dev_batch_size:(i + 1) * self._dev_batch_size])
        label_slice.append(label[i * self._dev_batch_size:(i + 1) * self._dev_batch_size])

    # Draw one set of Gumbel noise variables, shared across all devices.
    gumbel_list = []
    for k in self._gumbel_var_names:
        if k[0] not in self._arg_dict[0]:
            break
        tmp_gumbel = sample_gumbel((k[1],))
        gumbel_list.append(mx.nd.array(tmp_gumbel))

    for i in range(len(self._ctxs)):
        # Bind data, labels, and auxiliary inputs to each executor.
        self._arg_dict[i][self._data_name][:] = data_slice[i]
        if self._model_type != 'softmax':
            self._arg_dict[i]['label_index'][:] = label_slice[i]
        label_ = mx.nd.one_hot(label_slice[i], self._label_shape[0])
        self._arg_dict[i][self._label_name][:] = label_
        if 'temperature' in self._arg_dict[i]:
            self._arg_dict[i]['temperature'][:] = temperature
        for k, v in self._b.items():
            self._arg_dict[i][k][:] = v
        for idx, k in enumerate(self._gumbel_var_names):
            if k[0] not in self._arg_dict[i]:
                break
            self._arg_dict[i][k[0]][:] = gumbel_list[idx]
            # self._arg_dict[i][k[0]][:] = mx.nd.zeros((k[1],))
        self._exe[i].forward(is_train=True)
        self._exe[i].backward()
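The helper sample_gumbel is called throughout these listings but never defined. A minimal NumPy sketch of the standard inverse-CDF construction it presumably implements (the eps clamp guards the logarithms; both the signature and the default are assumptions):

import numpy as np

def sample_gumbel(shape, eps=1e-20):
    """Sample standard Gumbel(0, 1) noise via -log(-log(U)), U ~ Uniform(0, 1)."""
    u = np.random.uniform(low=0.0, high=1.0, size=shape)
    return -np.log(-np.log(u + eps) + eps)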
if method == 'deterministic_neuralsort':
    scores = multi_mnist_cnn.deepnn(l, X, 1)
    scores = tf.reshape(scores, [M, n, 1])
    P_hat = util.neuralsort(scores, temperature)
    losses = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=P_true, logits=tf.log(P_hat + 1e-20), dim=2)
    losses = tf.reduce_mean(losses, axis=-1)
    loss = tf.reduce_mean(losses)
elif method == 'stochastic_neuralsort':
    scores = multi_mnist_cnn.deepnn(l, X, 1)
    scores = tf.reshape(scores, [M, n, 1])
    P_hat = util.neuralsort(scores, temperature)

    # Monte Carlo estimate: replicate the scores n_s times and perturb
    # each copy with Gumbel noise before applying the relaxed sort.
    scores_sample = tf.tile(scores, [n_s, 1, 1])
    scores_sample += util.sample_gumbel([M * n_s, n, 1])
    P_hat_sample = util.neuralsort(scores_sample, temperature)
    P_true_sample = tf.tile(P_true, [n_s, 1, 1])
    losses = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=P_true_sample, logits=tf.log(P_hat_sample + 1e-20), dim=2)
    losses = tf.reduce_mean(losses, axis=-1)
    loss = tf.reduce_mean(losses)
else:
    raise ValueError("No such method.")


def vec_gradient(l):
    # l is a scalar
    gradient = tf.gradients(l, tf.trainable_variables())
    vec_grads = [tf.reshape(grad, [-1]) for grad in gradient]  # flatten each gradient
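util.neuralsort is the continuous relaxation of argsort from the NeuralSort paper (Grover et al.): row i of the output soft-selects the i-th largest score. A minimal TF1-style sketch consistent with the published operator (the repo's implementation may differ in details):

def neuralsort(s, tau=1.0):
    """Map scores s of shape [batch, n, 1] to a row-stochastic relaxed
    permutation matrix P_hat of shape [batch, n, n]."""
    n = tf.shape(s)[1]
    A_s = tf.abs(s - tf.transpose(s, perm=[0, 2, 1]))   # pairwise |s_j - s_k|
    B = tf.reduce_sum(A_s, axis=2, keepdims=True)       # sum_k |s_j - s_k|, [batch, n, 1]
    k = tf.cast(tf.range(1, n + 1), tf.float32)         # ranks 1..n
    coeff = tf.reshape(tf.cast(n, tf.float32) + 1.0 - 2.0 * k, [1, 1, -1])
    C = s * coeff                                       # [batch, j, i]: s_j * (n + 1 - 2i)
    P = tf.transpose(C - B, perm=[0, 2, 1])             # [batch, i, j]
    return tf.nn.softmax(P / tau, axis=-1)              # softmax over j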
    # Tail of the preceding (noise-free Sinkhorn) branch.
    point_estimates = tf.reduce_sum(prob_median * regression_candidates, axis=1)
    exp_loss = tf.squared_difference(y, point_estimates)
    loss_phi = tf.reduce_mean(exp_loss)
    loss_theta = loss_phi

    # For evaluation, anneal the temperature towards zero so that P_hat
    # is numerically a hard permutation matrix.
    P_hat_eval = sinkhorn_operator(pre_sinkhorn, temp=1e-20)
    prob_median_eval = get_median_probs(P_hat_eval)
elif method == 'gumbel_sinkhorn':
    with tf.variable_scope('phi'):
        representations = multi_mnist_cnn.deepnn(l, X, n)
        pre_sinkhorn_orig = tf.reshape(representations, [M, n, n])
        # Replicate the logits n_s times and perturb each copy with Gumbel noise.
        pre_sinkhorn = tf.tile(pre_sinkhorn_orig, [n_s, 1, 1])
        pre_sinkhorn += util.sample_gumbel([n_s * M, n, n])
    with tf.variable_scope('theta'):
        regression_candidates = multi_mnist_cnn.deepnn(l, X, 1)
        regression_candidates = tf.reshape(regression_candidates, [M, n])
    P_hat = sinkhorn_operator(pre_sinkhorn, temp=temp)
    prob_median = get_median_probs(P_hat)
    prob_median = tf.reshape(prob_median, [n_s, M, n])
    # Average the squared error over the n_s Gumbel samples.
    point_estimates = tf.reduce_sum(prob_median * regression_candidates, axis=2)
    exp_loss = tf.squared_difference(y, point_estimates)
    loss_phi = tf.reduce_mean(exp_loss)
    loss_theta = loss_phi
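sinkhorn_operator is likewise not defined in this listing. A plausible sketch, assuming it scales the logits by the temperature and then alternately normalizes rows and columns in log space (the signature and the n_iters default are assumptions, chosen to match both call sites above):

def sinkhorn_operator(log_alpha, temp=1.0, n_iters=20):
    """Approximately project exp(log_alpha / temp) onto the set of
    doubly-stochastic matrices via Sinkhorn iterations in log space."""
    log_alpha = log_alpha / temp
    for _ in range(n_iters):
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=2, keepdims=True)  # rows
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=1, keepdims=True)  # columns
    return tf.exp(log_alpha)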
def gumbel_sinkhorn(log_alpha, temp=1.0, n_samples=1, noise_factor=1.0,
                    n_iters=20, squeeze=True):
    """Random doubly-stochastic matrices via Gumbel noise.

    In the zero-temperature limit, sinkhorn(log_alpha/temp) approaches a
    permutation matrix. Therefore, for low temperatures this method can be
    seen as approximate sampling of permutation matrices, where the
    distribution is parameterized by the matrix log_alpha.

    The deterministic case (noise_factor=0) is also interesting: it can be
    shown that lim t->0 sinkhorn(log_alpha/t) = M, where M is a permutation
    matrix, the solution of the matching problem
    M = argmax_M sum_{i,j} log_alpha_{i,j} M_{i,j}. Therefore, the
    deterministic limit of gumbel_sinkhorn can be seen as approximately
    solving a matching problem, otherwise solved via the Hungarian
    algorithm.

    Warning: convergence holds only in the limit n_iters -> infinity.
    In practice n_iters is finite, which can lead to numerical
    instabilities, mostly when temp is very low. These manifest as
    pseudo-convergence, or as some rows/columns converging to fractional
    entries (e.g. a row having two entries of 0.5 instead of a single 1.0).
    To minimize these effects, try increasing n_iters when decreasing temp.
    On the other hand, too low a temperature usually leads to high-variance
    gradients, so it is better not to choose too low a temperature.

    Args:
      log_alpha: 2D tensor (a matrix of shape [N, N]) or 3D tensor (a batch
        of matrices of shape [batch_size, N, N]).
      temp: temperature parameter, a float.
      n_samples: number of samples.
      noise_factor: scaling factor for the Gumbel samples. Mostly to explore
        different degrees of randomness (and the absence of randomness, with
        noise_factor=0).
      n_iters: number of Sinkhorn iterations. Should be chosen carefully, in
        inverse correspondence with temp, to avoid numerical instabilities.
      squeeze: a boolean; if True and there is a single sample, the output
        is a 3D tensor rather than 4D.

    Returns:
      sink: a 4D tensor of shape [batch_size, n_samples, N, N], i.e.
        batch_size * n_samples doubly-stochastic matrices. If n_samples = 1
        and squeeze = True, the output is 3D.
      log_alpha_w_noise: a 4D tensor of shape [batch_size, n_samples, N, N]
        of noisy samples of log_alpha, divided by the temperature. If
        n_samples = 1 and squeeze = True, the output is 3D.
    """
    n = tf.shape(log_alpha)[1]
    log_alpha = tf.reshape(log_alpha, [-1, n, n])
    batch_size = tf.shape(log_alpha)[0]
    # Replicate log_alpha n_samples times and perturb with scaled Gumbel noise.
    log_alpha_w_noise = tf.tile(log_alpha, [n_samples, 1, 1])
    if noise_factor == 0:
        noise = 0.0
    else:
        noise = sample_gumbel([n_samples * batch_size, n, n]) * noise_factor
    log_alpha_w_noise += noise
    log_alpha_w_noise /= temp
    sink = sinkhorn_operator(log_alpha_w_noise, n_iters=n_iters)
    if n_samples > 1 or squeeze is False:
        # Unstack the sample dimension: [batch_size, n_samples, N, N].
        sink = tf.reshape(sink, [n_samples, batch_size, n, n])
        sink = tf.transpose(sink, [1, 0, 2, 3])
        log_alpha_w_noise = tf.reshape(
            log_alpha_w_noise, [n_samples, batch_size, n, n])
        log_alpha_w_noise = tf.transpose(log_alpha_w_noise, [1, 0, 2, 3])
    return sink, log_alpha_w_noise
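A hypothetical usage sketch, relying on the sample_gumbel and sinkhorn_operator sketches above (the 2x2 logits and the TF1 session setup are illustrative, not from the source): draw several soft permutations from fixed logits and check that rows and columns sum to approximately one.

# Hypothetical usage: sample 5 near-permutation matrices from 2x2 logits.
log_alpha = tf.constant([[3.0, 1.0],
                         [0.5, 2.0]])
sink, _ = gumbel_sinkhorn(log_alpha, temp=0.1, n_samples=5, n_iters=50)

with tf.Session() as sess:
    samples = sess.run(sink)       # shape [1, 5, 2, 2]
    print(samples.sum(axis=-1))    # each row sums to ~1
    print(samples.sum(axis=-2))    # each column sums to ~1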