def gumbel_softmax(logits, temperature: float = 1.0, eps: float = 1E-10,
                   hard=True, use_np_gumbel: bool = True):
    r"""Perform the Gumbel-Softmax trick to generate differentiable one-hot vectors
    from the input logits.

    Here, the Gumbel distribution is

        Gumbel(\alpha) = -\log(-\log U) + \log \alpha,

    in which U is the Uniform(0, 1) distribution.

    A nice property of Gumbel is:

        \argmax({Gumbel(\alpha_i)}) \sim multinomial(\alpha_i)

    The Gumbel-Softmax trick uses softmax + the straight-through estimator to
    produce one-hot vectors that represent the sampling result.

    References:

        1. https://en.wikipedia.org/wiki/Gumbel_distribution
        2. [ICLR2017] Categorical Reparameterization with Gumbel-Softmax

    Parameters
    ----------
    logits
        Logits. Shape (..., V)
    temperature
        The temperature that controls the smoothness of the output distribution.
        As the temperature approaches 0, the samples become closer to one-hot vectors.
    eps
        The eps for stability of gradient
    hard
        Whether to use the straight-through estimator to produce one-hot vectors.
    use_np_gumbel
        Whether to use the random.gumbel operator

    Returns
    -------
    ret
        The returned output. Shape (..., V)
    """
    # TODO(sxjscience) Investigate the impact of random.gumbel:
    #  Actually, random.gumbel has no eps and may have problems in calculating the gradient.
    if use_np_gumbel:
        gumbels = np.random.gumbel(np.zeros_like(logits))
    else:
        u = np.random.uniform(np.zeros_like(logits), 1)
        gumbels = -np.log(-np.log(u + eps) + eps)
    y = npx.softmax((gumbels + logits) / temperature, axis=-1)
    if hard:
        y_hard = np.max(y, axis=-1, keepdims=True) == y
        y_hard = npx.stop_gradient(y_hard - y) + y
        return y_hard
    else:
        return y
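# --- Usage sketch (added illustration, not part of the original source). Assumes
# the MXNet numpy API (`from mxnet import np, npx; npx.set_np()`); the helper
# name is hypothetical.
def _example_gumbel_softmax():
    logits = np.array([[1.0, 2.0, 0.5],
                       [0.1, 0.1, 3.0]])
    # Each row of the result is a one-hot vector in the forward pass, while
    # gradients flow through the underlying softmax via the straight-through
    # estimator.
    return gumbel_softmax(logits, temperature=0.5, hard=True)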
def trunc_gumbel(logits, truncation):
    r"""Sample from the TruncGumbel distribution.

    The cumulative density function (CDF) of the Truncated Gumbel distribution is
    defined as

        TruncGumbel(\alpha, truncation) \propto max(Gumbel(\alpha), truncation)

    To sample from the distribution, we can use the CDF inversion technique.

    References:

        1. [NIPS2014] A* Sampling, https://papers.nips.cc/paper/5449-a-sampling.pdf
        2. https://cmaddis.github.io/gumbel-machinery

    Parameters
    ----------
    logits
        The logits. Shape (...,)
    truncation
        The truncation. Shape (...,)

    Returns
    -------
    samples
        Samples from TruncGumbel(logits, truncation). Shape (...,)
    """
    gumbels = np.random.gumbel(np.zeros_like(logits)) + logits
    return -np.log(np.exp(-gumbels) + np.exp(-truncation))
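# --- Usage sketch (added illustration; hypothetical helper, same `np` namespace
# as above). Checks the defining property that every sample is bounded above by
# the truncation value.
def _example_trunc_gumbel():
    logits = np.zeros((5,))
    truncation = np.full((5,), 1.0)
    samples = trunc_gumbel(logits, truncation)
    # -log(exp(-g) + exp(-t)) <= -log(exp(-t)) = t for any Gumbel draw g.
    assert np.all(samples <= truncation)
    return samples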
import math

def relative_position_bucket(relative_position, bidirectional: bool = True,
                             num_buckets: int = 32, max_distance: int = 128):
    """Map the relative position to buckets.

    The implementation is consistent with that in
    [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where the relative position is defined as `mem_i - query_j`. Thus, a positive value
    indicates that the memory slot is in a later timestamp than the query slot.

    After handling the bidirectional case (see below), the implementation uses the first
    half of the buckets to store exact differences and the second half to store the
    differences after a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention. If it's bidirectional,
        positive shifts are mapped to [0, num_buckets // 2) and negative shifts are
        mapped to [num_buckets // 2, num_buckets).
    num_buckets
        The number of buckets.
    max_distance
        Maximum distance. Positions that fall outside of 'max_distance' will be trimmed.

    Returns
    -------
    buckets
        Shape (...,). It has the same shape as the `relative_position`.
        It will have int32 type.
    """
    ret = 0
    relative_position = -relative_position
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets ' \
                                     'must be divisible by 2.'
        num_buckets //= 2
        ret = ret + (relative_position < 0).astype(np.int32) * num_buckets
        relative_position = np.abs(relative_position)
    else:
        # Clip all the negative values to 0
        relative_position = np.clip(relative_position, a_min=0, a_max=None)
    # Now, the relative_position is in the range [0, inf)

    # Half of the buckets deal with the exact increments,
    # i.e., 0, 1, 2, ..., max_exact - 1, where max_exact = num_buckets // 2
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # The other half of the buckets are for logarithmically bigger bins in positions
    # up to max_distance
    val_if_large = max_exact + (
        np.log(relative_position.astype(np.float32) / max_exact)
        / math.log(max_distance / max_exact) * (num_buckets - max_exact)).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret = ret + np.where(is_small, relative_position, val_if_large)
    return ret
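# --- Usage sketch (added illustration; hypothetical helper). Maps the relative
# positions of a query attending over 8 memory slots to bidirectional buckets
# as in the mesh_tensorflow reference above.
def _example_relative_position_bucket():
    relative_position = np.arange(-4, 4).reshape((1, 8))
    buckets = relative_position_bucket(relative_position,
                                       bidirectional=True,
                                       num_buckets=32,
                                       max_distance=128)
    # All shifts here are small, so they land in the "exact" buckets; larger
    # shifts would be compressed logarithmically up to max_distance.
    return buckets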
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Transform (anchor, assigned ground-truth box) pairs into the offsets used
    as bounding-box regression targets."""
    c_anc = d2l.box_corner_to_center(anchors)
    c_assigned_bb = d2l.box_corner_to_center(assigned_bb)
    offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:]  # standard deviation = 0.1
    offset_wh = 5 * np.log(
        eps + c_assigned_bb[:, 2:] / c_anc[:, 2:])  # standard deviation = 0.2
    offset = np.concatenate([offset_xy, offset_wh], axis=1)
    return offset
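# --- Usage sketch (added illustration; hypothetical helper, assumes the d2l
# MXNet package for `box_corner_to_center`).
def _example_offset_boxes():
    # One anchor and its assigned ground-truth box, both given as
    # (x_min, y_min, x_max, y_max) corner coordinates.
    anchors = np.array([[0.1, 0.1, 0.3, 0.3]])
    assigned_bb = np.array([[0.12, 0.08, 0.32, 0.28]])
    # Shape (1, 4): scaled center offsets followed by scaled log size ratios.
    return offset_boxes(anchors, assigned_bb)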
def test_gammaln():
    A = np.ones((2, INT_OVERFLOW))
    A[0][0] = 5
    A.attach_grad()
    with mx.autograd.record():
        B = npx.gammaln(A)
    assert B.shape == (2, INT_OVERFLOW)
    # gammaln(5) = log(4!) = log(24)
    assert_almost_equal(B[0][0], np.array([np.log(24)]),
                        rtol=1e-3, atol=1e-5)
    B.backward()
    assert A.grad.shape == (2, INT_OVERFLOW)
    # d/dx gammaln(x) at x = 5 is digamma(5) ~= 1.5061178
    assert_almost_equal(A.grad[0][0], np.array([1.5061178]),
                        rtol=1e-3, atol=1e-5)
def forward(self, length_predictions, labels):
    """
    Returns Poisson loss and output given data and expected integers as labels.

    :param length_predictions: Length predictions. Shape: (batch_size,).
    :param labels: Targets. Shape: (batch_size,).
    :return: Poisson loss of length predictions of the batch, and number of
             samples (batch size).
    """
    # (batch_size,)
    loss = length_predictions - labels * np.log(np.maximum(1e-10, length_predictions))
    # (1,)
    loss = np.sum(loss * self.weight)
    num_samples = np.sum(np.ones_like(length_predictions))
    return loss, num_samples
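# --- Worked check (added illustration; hypothetical helper, plain Python). The
# loss above is the Poisson negative log-likelihood lambda - k * log(lambda),
# with the log(k!) term dropped since it is constant in the predictions and
# therefore does not affect gradients.
def _example_poisson_loss_term():
    import math
    prediction, label = 4.0, 3.0
    # 4.0 - 3.0 * log(4.0) ~= -0.159
    return prediction - label * math.log(max(1e-10, prediction))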
def test_power():
    A = np.full((2, INT_OVERFLOW), 2)
    B = np.ones((2, INT_OVERFLOW))
    B[-1, -1] = 3
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = np.power(A, B)
        C.backward()
    assert C.shape == A.shape
    assert C[-1, -1] == 8
    assert A.grad.shape == A.shape
    # d/dA A**B = B * A**(B-1) = 3 * 2**2 = 12
    assert A.grad[-1, -1] == 12
    assert B.grad.shape == B.shape
    # d/dB A**B = A**B * log(A) = 2**3 * log(2)
    assert_almost_equal(B.grad[-1, -1], 2**3 * np.log(2), rtol=1e-5, atol=1e-5)
def test_ldexp():
    A = np.ones((2, INT_OVERFLOW))
    B = np.ones((2, INT_OVERFLOW))
    A[-1, -1], B[-1, -1] = 5, 2
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = np.ldexp(A, B)
        C.backward()
    assert C.shape == A.shape
    # ldexp(A, B) = A * 2**B = 5 * 2**2 = 20
    assert C[-1, -1] == 20
    assert A.grad.shape == A.shape
    assert A.grad[-1, -1] == 4
    assert B.grad.shape == B.shape
    # d/dB A * 2**B = A * 2**B * log(2)
    assert_almost_equal(B.grad[-1, -1], A[-1, -1] * 2**B[-1, -1] * np.log(2),
                        rtol=1e-5, atol=1e-5)
def linear_interpolation(predictions):
    return -np.log(utils.average_arrays(predictions))  # pylint: disable=invalid-unary-operand-type
def _init_sinusoidal_base(units):
    half_units = units // 2
    val = np.log(10000) / (half_units - 1)
    val = np.exp(np.arange(half_units, dtype=np.float32) * -val)
    return val
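# --- Usage sketch (added illustration; hypothetical helper). The geometric
# frequency base above is the standard ingredient of sinusoidal position
# embeddings; combining it with positions might look like this.
def _example_sinusoidal_embedding(seq_len=4, units=8):
    base = _init_sinusoidal_base(units)  # shape (units // 2,)
    positions = np.arange(seq_len, dtype=np.float32).reshape((-1, 1))
    scaled = positions * base            # shape (seq_len, units // 2)
    # First half sine, second half cosine.
    return np.concatenate([np.sin(scaled), np.cos(scaled)], axis=-1)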
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # set values less than 1 to 1
    clipped_preds = np.clip(net(features), 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())
def dynamic_masking(self, input_ids, valid_lengths):
    # TODO(zheyuye), two additional flags `disallow_from_mask` and `already_masked`
    # that control the masking status for each position in the sequence.
    """
    Generate masking positions on-the-fly instead of during preprocessing.

    Parameters
    ----------
    input_ids
        The batchified input_ids with shape (batch_size, max_seq_length)
    valid_lengths
        The batchified valid_lengths with shape (batch_size, )

    Returns
    -------
    masked_input_ids
        The masked input sequence in which 15% of the tokens are replaced with [MASK].
        shape (batch_size, max_seq_length)
    length_masks
        The mask for the whole sequence that zeroes out the positions beyond valid_length.
        shape (batch_size, max_seq_length)
    unmasked_tokens
        The original tokens that appear in the unmasked input sequence.
        shape (batch_size, num_masked_positions)
    masked_positions
        The masking positions in mx.np.ndarray.
        shape (batch_size, num_masked_positions)
    masked_lm_weights
        The weight matrix containing 0 or 1 to mark the actual effect of masked positions.
        shape (batch_size, num_masked_positions)
    """
    N = self._max_num_masked_position
    # Only valid tokens that are not special tokens are allowed to be masked
    valid_candidates = np.ones_like(input_ids, dtype=np.bool)
    ignore_tokens = [self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id]

    for ignore_token in ignore_tokens:
        # TODO(zheyuye), Update when operation += is supported
        valid_candidates = valid_candidates * \
            np.not_equal(input_ids, ignore_token)
    valid_lengths = valid_lengths.astype(np.float32)
    valid_candidates = valid_candidates.astype(np.float32)
    num_masked_position = mxnp.maximum(
        1, np.minimum(N, round(valid_lengths * self._mask_prob)))

    # Get the masking probability of each position
    sample_probs = self._proposal_distribution * valid_candidates
    sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
    sample_probs = npx.stop_gradient(sample_probs)
    gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
    # Following the official repo, use top-k sampling to avoid duplicate positions,
    # as in https://github.com/google-research/electra/issues/41
    masked_positions = npx.topk(
        mxnp.log(sample_probs) + gumbels, k=N,
        axis=-1, ret_typ='indices', dtype=np.int32)

    masked_weights = npx.sequence_mask(
        mxnp.ones_like(masked_positions),
        sequence_length=num_masked_position,
        use_sequence_length=True, axis=1, value=0)
    masked_positions = masked_positions * masked_weights
    length_masks = npx.sequence_mask(
        mxnp.ones_like(input_ids, dtype=np.float32),
        sequence_length=valid_lengths,
        use_sequence_length=True, axis=1, value=0)
    unmasked_tokens = select_vectors_by_position(
        input_ids, masked_positions) * masked_weights
    masked_weights = masked_weights.astype(np.float32)
    replaced_positions = (
        mxnp.random.uniform(
            mxnp.zeros_like(masked_positions),
            mxnp.ones_like(masked_positions)) < self._replace_prob) * masked_positions
    # Deal with multiple zero values in replaced_positions, which would cause
    # the [CLS] token to be replaced
    filled = mxnp.where(
        replaced_positions, self.vocab.mask_id, self.vocab.cls_id).astype(np.int32)
    # Mask the tokens by replacing them with [MASK]
    masked_input_ids = update_vectors_by_position(input_ids, filled, replaced_positions)

    # Note: masked_positions is likely to contain multiple zero values when the
    # number of masked positions has not reached the maximum. However, this case
    # rarely occurs, since valid_length is almost always equal to max_seq_length.
    masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                    masks=length_masks,
                                    unmasked_tokens=unmasked_tokens,
                                    masked_positions=masked_positions,
                                    masked_weights=masked_weights)
    return masked_input
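# --- Illustrative sketch (added; hypothetical helper) of the Gumbel-top-k trick
# used above, assuming the MXNet numpy API (`np`, `npx`). Adding Gumbel noise to
# the log-probabilities and taking the top-k indices draws k distinct positions,
# i.e., it samples without replacement from the proposal distribution.
def _example_gumbel_topk(sample_probs, k):
    gumbels = np.random.gumbel(np.zeros_like(sample_probs))
    return npx.topk(np.log(sample_probs) + gumbels, k=k, axis=-1,
                    ret_typ='indices', dtype=np.int32)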
def cross_entropy(y_hat, y):
    # Indexing with the label is an implicit multiplication by a one-hot vector:
    # the one-hot encoding of y is 0 everywhere except at the true class, so
    # picking y_hat[i, y[i]] equals the dot product with that masking array.
    return -np.log(y_hat[range(len(y_hat)), y])
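# --- Usage sketch (added illustration; hypothetical helper). Per-example
# cross-entropy for a batch of two predictions, assuming the MXNet numpy API.
def _example_cross_entropy():
    y_hat = np.array([[0.1, 0.3, 0.6],
                      [0.3, 0.2, 0.5]])
    y = np.array([0, 2], dtype='int32')
    # Returns [-log(0.1), -log(0.5)]
    return cross_entropy(y_hat, y)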
def loss(y_hat, y):
    m = y.shape[0]
    p = softmax(y_hat)
    return np.sum(-np.log(p[range(m), y]))
from concurrent.futures import ProcessPoolExecutor, as_completed

import d2l
import tqdm
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
from mxnet.gluon.utils import split_and_load

import logger

npx.set_np()

log = logger.get_logger('BERT')

INVALID_THD = 10  # Invalid throughput threshold ratio.
INVALID_LOG_THD = np.log(INVALID_THD)


### Class Declarations

class BERTEncoder(nn.Block):
    """BERT Encoder class."""
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, num_layers,
                 dropout, **kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.blks = nn.Sequential()
        for _ in range(num_layers):
            self.blks.add(
                d2l.EncoderBlock(num_hiddens, ffn_num_hiddens, num_heads,
                                 dropout))
def forward(self, positive, negative):
    distances = positive - negative
    loss = -np.sum(np.log(npx.sigmoid(distances)), 0, keepdims=True)
    return loss
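# --- Usage sketch (added illustration; hypothetical helper). The pairwise loss
# above shrinks as the positive scores exceed the negative ones; with toy scores:
def _example_pairwise_loss():
    positive = np.array([2.0, 1.5])
    negative = np.array([0.5, 1.0])
    # -sum(log(sigmoid(positive - negative))) along axis 0
    return -np.sum(np.log(npx.sigmoid(positive - negative)), 0, keepdims=True)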
def cross_entropy(y_hat, y):
    return -np.log(y_hat[range(len(y_hat)), y])
def log_linear_interpolation(predictions):
    log_probs = utils.average_arrays([np.log(p) for p in predictions])
    return -npx.log_softmax(log_probs)  # pylint: disable=invalid-unary-operand-type
def log_rmse(net, features, labels):
    # To further stabilize the value when the log is taken,
    # set values less than 1 to 1
    net_out = net(features)
    clipped_preds = np.clip(net_out, 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())
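# --- Note (added illustration; hypothetical helper). Gluon's L2Loss computes
# (pred - label)**2 / 2, so the factor of 2 above undoes the built-in halving
# before the square root, yielding the plain RMSE between log-predictions and
# log-labels. A minimal check, assuming `from mxnet import np`:
def _example_log_rmse_factor():
    from mxnet.gluon import loss as gloss
    l2 = gloss.L2Loss()
    preds, labels = np.array([110.0]), np.array([100.0])
    manual = np.sqrt(((np.log(preds) - np.log(labels)) ** 2).mean())
    via_l2 = np.sqrt(2 * l2(np.log(preds), np.log(labels)).mean())
    # `manual` and `via_l2` agree up to floating-point error.
    return manual, via_l2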