def forward(self, input):
    """Apply the residual block and modulate the result with attention.

    The output is ``(input + block1(input)) * (1 + gamma * attn(input))``,
    where ``gamma`` is read from ``self.gamma.data()``.
    """
    block_out = self.block1(input)
    attention = self.attn(input)
    # "1 + scaled attention": a zero gamma leaves the residual sum untouched.
    gate = FF.ones_like(block_out) + self.gamma.data() * attention
    return (input + block_out) * gate
def test_getitem_autograd(np_array, index):
    """Check the gradient that flows back through ``x[index]``.

    The expected gradient is zero everywhere except at the indexed
    entries, which receive ones.
    """
    x = np.array(np_array, dtype=np_array.dtype)
    x.attach_grad()
    with autograd.record():
        selected = x[index]
    selected.backward()

    # Scatter ones into the indexed positions of an all-zero array.
    expected_grad = np.zeros_like(x)
    expected_grad[index] = np.ones_like(selected)
    assert same(expected_grad.asnumpy(), x.grad.asnumpy())
def forward(self, lengths):
    """Compute the length penalty ``(beta + lengths) ** alpha / denominator``.

    :param lengths: A scalar (int or float) or an array of lengths.
    :return: ``1.0`` (scalar input) or an array of ones (array input) when
             ``alpha`` is 0; otherwise the penalty for each length.
    """
    if self.alpha == 0.0:
        # A zero exponent disables the penalty; answer in the input's own kind.
        return 1.0 if isinstance(lengths, (int, float)) else np.ones_like(lengths)

    penalty = lengths
    if self.beta != 0.0:
        penalty = self.beta + penalty
    if self.alpha != 1.0:
        penalty = penalty ** self.alpha
    return penalty / self.denominator
def forward(self, input):
    """Run both branches, cross-enhance them with attention and fuse.

    Each branch output is scaled by ``1 + gamma * relational_attention``
    computed against the other branch. The two results are concatenated on
    axis 1, collected, and added to ``input``. That sum is finally scaled by
    ``1 + gamma1 * att(input)``.
    """
    # =========== UNet branch: compress twice, then expand with the stem ===========
    stem1 = self.conv_init_1(input)
    branch1 = FFx.relu(self.compr11(stem1))
    branch1 = FFx.relu(self.compr12(branch1))
    branch1 = FFx.relu(self.expand1(branch1, stem1))

    # =========== \capNet branch: expand first, then compress twice ===========
    stem2 = self.conv_init_2(input)
    branch2 = FFx.relu(self.expand2(stem2))
    branch2 = FFx.relu(self.compr21(branch2))
    branch2 = FFx.relu(self.compr22(FF.concatenate([branch2, stem2], axis=1)))

    input_gate = self.gamma1.data() * self.att(input)
    gate_1_from_2 = self.gamma2.data() * self.ratt122(branch1, branch2, branch2)
    gate_2_from_1 = self.gamma3.data() * self.ratt211(branch2, branch1, branch1)

    ones_stem = FF.ones_like(stem1)
    ones_input = FF.ones_like(input)

    # Enhanced output of 1, based on memory of 2
    enhanced1 = branch1 * (ones_stem + gate_1_from_2)
    # Enhanced output of 2, based on memory of 1
    enhanced2 = branch2 * (ones_stem + gate_2_from_1)

    fused = FFx.relu(self.collect(FF.concatenate([enhanced1, enhanced2], axis=1)))

    # Emphasize residual output from memory on input
    return (input + fused) * (ones_input + input_gate)
def forward(self, length_predictions, labels):
    """
    Returns the weighted, halved sum of squared length errors.

    :param length_predictions: Length predictions. Shape: (batch_size,).
    :param labels: Targets. Shape: (batch_size,).
    :return: Summed loss over the batch, and the number of samples.
    """
    # (batch_size,)
    squared_error = np.square(length_predictions - labels)
    # (1,)
    total = np.sum((self.weight / 2) * squared_error)
    # Counting via a sum of ones keeps the sample count an array value.
    num_samples = np.sum(np.ones_like(length_predictions))
    return total, num_samples
def forward(self, length_predictions, labels):
    """
    Returns Poisson loss and sample count given predictions and integer labels.

    :param length_predictions: Length predictions. Shape: (batch_size,).
    :param labels: Targets. Shape: (batch_size,).
    :return: Poisson loss of length predictions of the batch, and number of samples (batch size).
    """
    # Clamp from below so the logarithm never sees zero.
    clipped = np.maximum(1e-10, length_predictions)
    # (batch_size,)
    per_sample = length_predictions - labels * np.log(clipped)
    # (1,)
    total = np.sum(per_sample * self.weight)
    num_samples = np.sum(np.ones_like(length_predictions))
    return total, num_samples
def forward(self, input):
    """Run both branches, cross-enhance them with attention and fuse.

    Each branch output is scaled by ``1 + gamma * relational_attention``
    computed against the other branch. ``self.collect`` fuses the two
    results, and the fused tensor is added to ``input``. That sum is finally
    scaled by ``1 + gamma1 * att(input)``.
    """
    # =========== UNet branch: compress twice, then expand with the stem ===========
    stem1 = self.conv_init_1(input)
    branch1 = FFx.relu(self.compr11(stem1))
    branch1 = FFx.relu(self.compr12(branch1))
    branch1 = FFx.relu(self.expand1(branch1, stem1))

    # =========== \capNet branch: expand first, then compress twice ===========
    stem2 = self.conv_init_2(input)
    branch2 = FFx.relu(self.expand2(stem2))
    branch2 = FFx.relu(self.compr21(branch2))
    # No activation is applied here after the second compression.
    branch2 = self.compr22(branch2, stem2)

    input_gate = self.gamma1.data() * self.att(input)
    gate_1_from_2 = self.gamma2.data() * self.ratt122(branch1, branch2, branch2)
    gate_2_from_1 = self.gamma3.data() * self.ratt211(branch2, branch1, branch1)

    ones_stem = FF.ones_like(stem1)
    ones_input = FF.ones_like(input)

    # Enhanced output of 1, based on memory of 2
    enhanced1 = branch1 * (ones_stem + gate_1_from_2)
    # Enhanced output of 2, based on memory of 1
    enhanced2 = branch2 * (ones_stem + gate_2_from_1)

    # includes relu, it's for fusion
    fused = self.collect(enhanced1, enhanced2)

    return (input + fused) * (ones_input + input_gate)
def test_setitem_autograd(np_array, index):
    """Check autograd behaviour of ``x[index] = y``.

    Either the gradient reaching ``y`` is all ones, or the in-place
    assignment is rejected with the documented "Inplace operations" error.

    np_array: native numpy array.
    """
    inplace_error = ('Inplace operations (+=, -=, x[:]=, etc) are not supported '
                     'when recording with')
    x = np.array(np_array, dtype=np_array.dtype)
    assigned = np.array(_np.random.uniform(size=x[index].shape))
    assigned.attach_grad()
    try:
        with mx.autograd.record():
            x[index] = assigned
        x.backward()
        expected_grad = np.ones_like(assigned)
        assert same(expected_grad.asnumpy(), assigned.grad.asnumpy())
    except mx.base.MXNetError as err:
        # Recording an in-place write is allowed to fail, but only this way.
        assert inplace_error in str(err)
def forward(self, input_t1, input_t2):
    """Fuse two inputs after enhancing each with attention on the other.

    These inputs must have the same dimensionality (t1, t2). Each input is
    scaled by ``1 + gamma * relational_attention`` and the two results are
    concatenated on axis 1, passed through ``self.fuse`` and a ReLU.
    """
    gate_1_from_2 = self.gamma1.data() * self.relatt12(input_t1, input_t2, input_t2)
    gate_2_from_1 = self.gamma2.data() * self.relatt21(input_t2, input_t1, input_t1)

    # A single ones tensor serves both gates.
    ones = FF.ones_like(input_t1)

    # Enhanced output of 1, based on memory of 2
    enhanced1 = input_t1 * (ones + gate_1_from_2)
    # Enhanced output of 2, based on memory of 1
    enhanced2 = input_t2 * (ones + gate_2_from_1)

    stacked = FF.concatenate([enhanced1, enhanced2], axis=1)
    return FFx.relu(self.fuse(stacked))
def gen_self_attn_mask(data, valid_length=None,
                       dtype: type = np.float32,
                       attn_type: str = 'full',
                       layout: str = 'NT'):
    """Generate the mask used for the encoder, i.e, self-attention.

    In our implementation, 1 --> not masked, 0 --> masked

    Let's consider the data with two samples:

    .. code-block:: none

        data =
            [['I',   'can', 'now',   'use', 'numpy', 'in',  'Gluon@@', 'NLP'  ],
             ['May', 'the', 'force', 'be',  'with',  'you', '<PAD>',   '<PAD>']]
        valid_length =
            [8, 6]

    - attn_type = 'causal'
        Each token will attend to itself + the tokens before.
        It will not attend to tokens in the future.

        For our example, the mask of the first sample is

        .. code-block:: none

                       ['I', 'can', 'now', 'use', 'numpy', 'in', 'Gluon@@', 'NLP']
            'I':         1,    0,     0,     0,      0,     0,      0,      0
            'can':       1,    1,     0,     0,      0,     0,      0,      0
            'now':       1,    1,     1,     0,      0,     0,      0,      0
            'use':       1,    1,     1,     1,      0,     0,      0,      0
            'numpy':     1,    1,     1,     1,      1,     0,      0,      0
            'in':        1,    1,     1,     1,      1,     1,      0,      0
            'Gluon@@':   1,    1,     1,     1,      1,     1,      1,      0
            'NLP':       1,    1,     1,     1,      1,     1,      1,      1

        The mask of the second sample is

        .. code-block:: none

                       ['May', 'the', 'force', 'be', 'with', 'you', '<PAD>', '<PAD>']
            'May':        1,    0,     0,     0,      0,     0,      0,      0
            'the':        1,    1,     0,     0,      0,     0,      0,      0
            'force':      1,    1,     1,     0,      0,     0,      0,      0
            'be':         1,    1,     1,     1,      0,     0,      0,      0
            'with':       1,    1,     1,     1,      1,     0,      0,      0
            'you':        1,    1,     1,     1,      1,     1,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0

    - attn_type = 'full'
        Each token will attend to both the tokens before and in the future

        For our example, the mask of the first sample is

        .. code-block:: none

                       ['I', 'can', 'now', 'use', 'numpy', 'in', 'Gluon@@', 'NLP']
            'I':         1,    1,     1,     1,      1,     1,      1,      1
            'can':       1,    1,     1,     1,      1,     1,      1,      1
            'now':       1,    1,     1,     1,      1,     1,      1,      1
            'use':       1,    1,     1,     1,      1,     1,      1,      1
            'numpy':     1,    1,     1,     1,      1,     1,      1,      1
            'in':        1,    1,     1,     1,      1,     1,      1,      1
            'Gluon@@':   1,    1,     1,     1,      1,     1,      1,      1
            'NLP':       1,    1,     1,     1,      1,     1,      1,      1

        The mask of the second sample is

        .. code-block:: none

                       ['May', 'the', 'force', 'be', 'with', 'you', '<PAD>', '<PAD>']
            'May':        1,    1,     1,     1,      1,     1,      0,      0
            'the':        1,    1,     1,     1,      1,     1,      0,      0
            'force':      1,    1,     1,     1,      1,     1,      0,      0
            'be':         1,    1,     1,     1,      1,     1,      0,      0
            'with':       1,    1,     1,     1,      1,     1,      0,      0
            'you':        1,    1,     1,     1,      1,     1,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0

    Parameters
    ----------
    data
        The data.

        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)
    valid_length
        Shape (batch_size,). When None, no position is treated as padding.
    dtype
        Data type of the mask
    attn_type
        Can be 'full' or 'causal'
    layout
        The layout of the data

    Returns
    -------
    mask
        Boolean mask. Shape (batch_size, seq_length, seq_length)

    Raises
    ------
    NotImplementedError
        If ``layout`` is not 'NT'/'TN' or ``attn_type`` is not 'full'/'causal'.
    """
    if layout == 'NT':
        batch_axis, time_axis = 0, 1
    elif layout == 'TN':
        batch_axis, time_axis = 1, 0
    else:
        raise NotImplementedError('Unsupported layout={}'.format(layout))
    if attn_type == 'full':
        if valid_length is not None:
            valid_length = valid_length.astype(dtype)
            steps = npx.arange_like(data, axis=time_axis)  # (seq_length,)
            # mask1 hides padded key columns, mask2 hides padded query rows.
            mask1 = (npx.reshape(steps, (1, 1, -1))
                     < npx.reshape(valid_length, (-2, 1, 1)))
            mask2 = (npx.reshape(steps, (1, -1, 1))
                     < npx.reshape(valid_length, (-2, 1, 1)))
            mask = mask1 * mask2
        else:
            # TODO(sxjscience) optimize
            seq_len_ones = np.ones_like(npx.arange_like(data, axis=time_axis))  # (seq_length,)
            batch_ones = np.ones_like(npx.arange_like(data, axis=batch_axis))   # (batch_size,)
            mask = batch_ones.reshape((-1, 1, 1)) * seq_len_ones.reshape((1, -1, 1))\
                   * seq_len_ones.reshape((1, 1, -1))
    elif attn_type == 'causal':
        steps = npx.arange_like(data, axis=time_axis)
        # mask: (seq_length, seq_length)
        # batch_mask: (batch_size, seq_length)
        mask = (np.expand_dims(steps, axis=0) <= np.expand_dims(steps, axis=1)).astype(dtype)
        if valid_length is not None:
            valid_length = valid_length.astype(dtype)
            batch_mask = (np.expand_dims(steps, axis=0)
                          < np.expand_dims(valid_length, axis=-1)).astype(dtype)
            mask = mask * np.expand_dims(batch_mask, axis=-1)
        else:
            batch_ones = np.ones_like(npx.arange_like(data, axis=batch_axis),
                                      dtype=dtype)  # (batch_size,)
            mask = mask * batch_ones.reshape((-1, 1, 1))
    else:
        # Name the offending value, matching the layout error above.
        raise NotImplementedError('Unsupported attn_type={}'.format(attn_type))
    return mask.astype(np.bool)
def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None,
                      dtype=np.float32, layout: str = 'NT'):
    """Generate the mask used for the decoder. All query slots are attended to the memory slots.

    In our implementation, 1 --> not masked, 0 --> masked

    Let's consider the data + mem with a batch of two samples:

    .. code-block:: none

        mem = [['I',   'can', 'now',   'use'],
               ['May', 'the', 'force', '<PAD>']]
        mem_valid_length =
            [4, 3]
        data = [['numpy', 'in',   'Gluon@@', 'NLP'  ],
                ['be',    'with', 'you',     '<PAD>']]
        data_valid_length =
            [4, 3]

    Rows are query (data) slots, columns are memory slots.
    For our example, the mask of the first sample is

    .. code-block:: none

                   ['I', 'can', 'now', 'use']
        'numpy':     1,    1,     1,     1
        'in':        1,    1,     1,     1
        'Gluon@@':   1,    1,     1,     1
        'NLP':       1,    1,     1,     1

    The mask of the second sample is

    .. code-block:: none

                   ['May', 'the', 'force', '<PAD>']
        'be':         1,     1,      1,       0
        'with':       1,     1,      1,       0
        'you':        1,     1,      1,       0
        '<PAD>':      0,     0,      0,       0

    Parameters
    ----------
    mem
        - layout = 'NT'
            Shape (batch_size, mem_length, C_mem)
        - layout = 'TN'
            Shape (mem_length, batch_size, C_mem)
    mem_valid_length
        Shape (batch_size,)
    data
        - layout = 'NT'
            Shape (batch_size, query_length, C_data)
        - layout = 'TN'
            Shape (query_length, batch_size, C_data)
    data_valid_length
        Shape (batch_size,). When None, every query slot is treated as valid.
    dtype
        Data type of the mask
    layout
        Layout of the data + mem tensor

    Returns
    -------
    mask
        Boolean mask. Shape (batch_size, query_length, mem_length)

    Raises
    ------
    NotImplementedError
        If ``layout`` is neither 'NT' nor 'TN'.
    """
    # Only the time axis is needed here; the batch dimension is recovered
    # from the valid-length vectors through the reshapes below.
    if layout == 'NT':
        time_axis = 1
    elif layout == 'TN':
        time_axis = 0
    else:
        raise NotImplementedError('Unsupported layout={}'.format(layout))
    mem_valid_length = mem_valid_length.astype(dtype)
    mem_steps = npx.arange_like(mem, axis=time_axis)  # (mem_length,)
    data_steps = npx.arange_like(data, axis=time_axis)  # (query_length,)
    mem_mask = (npx.reshape(mem_steps, (1, 1, -1))
                < npx.reshape(mem_valid_length, (-2, 1, 1))).astype(dtype)  # (B, 1, mem_length)
    if data_valid_length is not None:
        data_valid_length = data_valid_length.astype(dtype)
        data_mask = (npx.reshape(data_steps, (1, -1, 1))
                     < npx.reshape(data_valid_length, (-2, 1, 1))).astype(dtype)  # (B, query_length, 1)
        mask = mem_mask * data_mask
    else:
        # No query padding: broadcast the memory mask over every query slot.
        query_length_ones = np.ones_like(data_steps)
        mask = query_length_ones.reshape((1, -1, 1)) * mem_mask
    return mask.astype(np.bool)
def get_corrupted_tokens(self, inputs, original_tokens, masked_positions, logits):
    """
    Sample from the generator to create corrupted input.

    Parameters
    ----------
    inputs
        The masked input

        - layout = 'NT'
            Shape (batch_size, seq_length)
        - layout = 'TN'
            Shape (seq_length, batch_size)
    original_tokens
        The original tokens that appear in the unmasked input sequence
        Shape (batch_size, num_masked_positions).
    masked_positions
        The masked position of the sequence
        Shape (batch_size, num_masked_positions).
    logits
        The logits of each tokens
        Shape (batch_size, num_masked_positions, vocab_size)

    Returns
    -------
    corrupted_tokens
        The sampled token ids (argmax over the vocabulary axis of the sample).
        Shape (batch_size, num_masked_positions)
    fake_data
        The input with the sampled tokens written at the masked positions.

        - layout = 'NT'
            Shape (batch_size, seq_length)
        - layout = 'TN'
            Shape (seq_length, batch_size)
    labels
        Non-zero where a masked position now holds a token different from
        the original one.

        - layout = 'NT'
            Shape (batch_size, seq_length)
        - layout = 'TN'
            Shape (seq_length, batch_size)
    """
    if self._disallow_correct:
        # Push down the logit of the ground-truth token so it is (almost)
        # never sampled. The one-hot must be taken over the original token
        # ids: it is subtracted from logits along the vocabulary axis, so
        # indexing it by sequence positions would penalise unrelated ids.
        disallow = npx.one_hot(original_tokens, depth=self.vocab_size, dtype=self._dtype)
        logits = logits - 1000.0 * disallow
    # gumbel_softmax() samples from the logits with a noise of Gumbel distribution
    prob = gumbel_softmax(
        logits,
        temperature=self._temperature,
        eps=self._gumbel_eps,
        use_np_gumbel=False)
    corrupted_tokens = np.argmax(prob, axis=-1).astype(np.int32)

    # The position helpers below are applied to batch-major data, so a 'TN'
    # input is transposed here and the results are transposed back on return.
    is_time_major = self.disc_backbone.layout == 'TN'
    if is_time_major:
        inputs = inputs.T
    original_data = update_vectors_by_position(inputs, original_tokens, masked_positions)
    fake_data = update_vectors_by_position(inputs, corrupted_tokens, masked_positions)
    updates_mask = add_vectors_by_position(np.zeros_like(inputs),
                                           np.ones_like(masked_positions),
                                           masked_positions)
    # Dealing with multiple zeros in masked_positions which
    # results in a non-zero value in the first index [CLS]
    updates_mask = np.minimum(updates_mask, 1)
    labels = updates_mask * np.not_equal(fake_data, original_data)
    if is_time_major:
        return corrupted_tokens, fake_data.T, labels.T
    return corrupted_tokens, fake_data, labels
def forward(self, pred, label, valid_len):
    """Compute the parent loss with weights produced by a sequence mask.

    A tensor of ones shaped like ``label`` (plus a trailing axis) is passed
    through ``npx.sequence_mask`` with ``valid_len`` along axis 1, and the
    result is handed to the parent ``forward`` as the sample weights.
    """
    # weights shape: (batch_size, seq_len, 1)
    all_ones = np.expand_dims(np.ones_like(label), axis=-1)
    masked_weights = npx.sequence_mask(all_ones, valid_len, True, axis=1)
    return super(MaskedSoftmaxCELoss, self).forward(pred, label, masked_weights)
def dynamic_masking(self, input_ids, valid_lengths):
    """
    Generate masking positions on-the-fly instead of during preprocessing

    Parameters
    ----------
    input_ids
        The batchified input_ids with shape (batch_size, max_seq_length)
    valid_lengths
        The batchified valid_lengths with shape (batch_size, )

    Returns
    ------
    masked_input
        A ``self.MaskedInput`` with the following fields:

        input_ids
            The input sequence in which selected positions are replaced
            with [MASK]. shape (batch_size, max_seq_length)
        masks
            The masking matrix for the whole sequence that indicates the
            positions are greater than valid_length.
            shape (batch_size, max_seq_length)
        unmasked_tokens
            The original tokens that appear in the unmasked input sequence
            shape (batch_size, num_masked_positions)
        masked_positions
            The masking positions in mx.np.ndarray
            shape (batch_size, num_masked_positions)
        masked_weights
            The weight matrix containing 0 or 1 to mark the actual effect of
            masked positions
            shape (batch_size, num_masked_positions)
    """
    # TODO(zheyuye), two additional flag `disallow_from_mask` and `already_masked`
    # that control the masking status for each positions in the sequence.
    max_masked = self._max_num_masked_position

    # Only valid token without special token are allowed to mask
    candidates = np.ones_like(input_ids, dtype=np.bool)
    for special_id in (self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id):
        # TODO(zheyuye), Update when operation += supported
        candidates = candidates * \
            np.not_equal(input_ids, special_id)
    valid_lengths = valid_lengths.astype(np.float32)
    candidates = candidates.astype(np.float32)

    # Per-sample number of positions that really count, clamped to [1, max_masked].
    num_masked_position = mxnp.maximum(
        1, np.minimum(max_masked, round(valid_lengths * self._mask_prob)))

    # Get the masking probability of each position
    sample_probs = self._proposal_distribution * candidates
    sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
    sample_probs = npx.stop_gradient(sample_probs)
    gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))

    # Following the instruction of official repo to avoid duplicate positions
    # with Top_k Sampling as https://github.com/google-research/electra/issues/41
    noisy_scores = mxnp.log(sample_probs) + gumbels
    masked_positions = npx.topk(noisy_scores, k=max_masked,
                                axis=-1, ret_typ='indices', dtype=np.int32)

    # Zero out the slots beyond each sample's own number of masked positions.
    masked_weights = npx.sequence_mask(
        mxnp.ones_like(masked_positions),
        sequence_length=num_masked_position,
        use_sequence_length=True, axis=1, value=0)
    masked_positions = masked_positions * masked_weights

    length_masks = npx.sequence_mask(
        mxnp.ones_like(input_ids, dtype=np.float32),
        sequence_length=valid_lengths,
        use_sequence_length=True, axis=1, value=0)

    unmasked_tokens = select_vectors_by_position(
        input_ids, masked_positions) * masked_weights
    masked_weights = masked_weights.astype(np.float32)

    uniform_draws = mxnp.random.uniform(
        mxnp.zeros_like(masked_positions),
        mxnp.ones_like(masked_positions))
    replaced_positions = (uniform_draws < self._replace_prob) * masked_positions

    # dealing with multiple zero values in replaced_positions which causes
    # the [CLS] being replaced
    filled = mxnp.where(replaced_positions, self.vocab.mask_id,
                        self.vocab.cls_id).astype(np.int32)

    # Masking token by replacing with [MASK]
    masked_input_ids = update_vectors_by_position(input_ids, filled, replaced_positions)

    # Note: It is likely have multiple zero values in masked_positions if number of masked of
    # positions not reached the maximum. However, this example hardly exists since valid_length
    # is almost always equal to max_seq_length
    return self.MaskedInput(input_ids=masked_input_ids,
                            masks=length_masks,
                            unmasked_tokens=unmasked_tokens,
                            masked_positions=masked_positions,
                            masked_weights=masked_weights)
def test_ones_like():
    """Check ``np.ones_like`` on an array with ``INT_OVERFLOW`` columns."""
    source = np.ones((2, INT_OVERFLOW))
    result = np.ones_like(source)
    assert result.shape == source.shape
    # Probe the first and the very last element of the large array.
    assert result[0, 0] == 1
    assert result[-1, -1] == 1