def test_to_tensor():
    """ToTensor should convert uint8 channel-last input to float32 channel-first in [0, 1]."""

    def _to_tensor(arr):
        # Run the transform on a uint8 copy of ``arr``.
        return transforms.ToTensor()(np.array(arr, dtype='uint8'))

    # 3D input: HWC -> CHW, values scaled by 1/255.
    data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8)
    expected = np.transpose(data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))
    assert_almost_equal(_to_tensor(data_in).asnumpy(), expected)

    # 4D input: NHWC -> NCHW.
    data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8)
    expected = np.transpose(data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2))
    assert_almost_equal(_to_tensor(data_in).asnumpy(), expected)

    # 5D input is rejected.
    invalid_data_in = np.random.uniform(
        0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8)
    transformer = transforms.ToTensor()
    assertRaises(MXNetError, transformer, invalid_data_in)

    # Bounds: 0 must map exactly to 0.0 and 255 exactly to 1.0.
    data_in = np.zeros((10, 20, 3)).astype(dtype=np.uint8)
    assert same(_to_tensor(data_in).asnumpy(),
                np.transpose(np.zeros(data_in.shape, dtype=np.float32), (2, 0, 1)))

    data_in = np.full((10, 20, 3), 255).astype(dtype=np.uint8)
    assert same(_to_tensor(data_in).asnumpy(),
                np.transpose(np.ones(data_in.shape, dtype=np.float32), (2, 0, 1)))
def init_state_from_encoder(
        self,
        encoder_outputs: np.ndarray,
        encoder_valid_length: Optional[np.ndarray] = None,
        target_embed: Optional[np.ndarray] = None) -> List[np.ndarray]:
    """
    Returns the initial states given encoder output.

    States for teacher-forced training are encoder outputs and a valid length mask for
    encoder outputs. At inference, this method returns the following state tuple:
    valid length bias, step state,
    [projected encoder attention keys, projected encoder attention values] * num_layers,
    [autoregressive state dummies] * num_layers.

    NOTE: the ordering of the returned list is load-bearing — the step ``forward``
    unpacks states positionally, so keep both in sync.

    :param encoder_outputs: Encoder outputs. Shape: (batch, source_length, encoder_dim).
    :param encoder_valid_length: Valid lengths of encoder outputs. Shape: (batch,).
    :param target_embed: Target-side embedding layer output.
                         Shape: (batch, target_length, target_embedding_dim).
    :return: Initial states.
    """
    if target_embed is None:
        # Inference: initial step = 0. Shape: (batch_size, 1)
        steps = np.expand_dims(np.zeros_like(encoder_valid_length), axis=1)
    else:
        # Training: steps up to target length. Shape: (1, target_length)
        steps = np.expand_dims(npx.arange_like(target_embed, axis=1), axis=0)

    if self.inference_only:
        # Encoder projection caching, therefore we don't pass the encoder_outputs
        states = [steps, encoder_valid_length]

        for layer in self.layers:
            # Pre-compute each layer's encoder-attention key/value projection once;
            # decode steps reuse these unchanged. Stored time-major (transposed).
            enc_att_kv = layer.enc_attention.ff_kv(encoder_outputs)
            states.append(np.transpose(enc_att_kv, axes=(1, 0, 2)))
    else:
        # NO encoder projection caching: pass the raw (time-major) encoder outputs.
        states = [steps,
                  np.transpose(encoder_outputs, axes=(1, 0, 2)),
                  encoder_valid_length]

    _batch_size = encoder_outputs.shape[0]
    _ctx = encoder_outputs.ctx
    _dtype = encoder_outputs.dtype
    # One zero-filled placeholder per autoregressive state tensor of every layer;
    # these are replaced by real per-step states as decoding progresses.
    dummy_autoregr_states = [np.zeros(layer.get_states_shape(_batch_size), ctx=_ctx, dtype=_dtype)
                             for layer in self.layers
                             for _ in range(layer.num_state_tensors)]

    states += dummy_autoregr_states
    return states
def test_np_transpose():
    # TODO(junwu): Add more test cases
    # Symbolic path: transposing an np-compatible symbol yields an np symbol.
    sym_ret = mx.sym.var('a').as_np_ndarray().transpose()
    assert type(sym_ret) == mx.sym.np._Symbol

    def _candidate_axes(ndim):
        # ``None`` exercises the default (reverse all axes); for non-scalar
        # inputs also try the identity permutation and a shuffled one.
        candidates = [None]
        if ndim == 0:
            candidates.append(())
        else:
            perm = [i for i in range(ndim)]
            candidates.append(tuple(perm))
            random.shuffle(perm)
            candidates.append(tuple(perm))
        return candidates

    for dtype in ('float32', 'int32'):
        for ndim in range(7):
            shape = rand_shape_nd(ndim, dim=5, allow_zero_size=True)
            np_data = _np.random.uniform(low=-100, high=100, size=shape).astype(dtype)
            mx_data = np.array(np_data, dtype=dtype)
            for axis in _candidate_axes(ndim):
                np_out = _np.transpose(np_data, axes=axis)
                mx_out = np.transpose(mx_data, axes=axis)
                assert np_out.dtype == mx_out.dtype
                assert same(mx_out.asnumpy(), np_out)
def test_np_transpose():
    """Test np.transpose forward and backward against NumPy, eager and hybridized."""

    def np_transpose_grad(out_shape, dtype, axes=None):
        # Gradient of transpose = inverse permutation applied to the output gradient
        # (all-ones here, matching the default head gradient of ``backward()``).
        ograd = _np.ones(out_shape, dtype=dtype)
        if axes is None or axes == ():
            # Default transpose reverses all axes; applying it again inverts it.
            return _np.transpose(ograd, axes)
        np_axes = _np.array(list(axes))
        # argsort of a permutation yields its inverse permutation.
        return _np.transpose(ograd, tuple(list(_np.argsort(np_axes))))

    class TestTranspose(HybridBlock):
        # Minimal block wrapping np.transpose so the op can be tested hybridized.
        def __init__(self, axes=None):
            super(TestTranspose, self).__init__()
            self.axes = axes

        def hybrid_forward(self, F, a):
            return F.np.transpose(a, self.axes)

    for hybridize in [True, False]:
        for dtype in [_np.int32, _np.float32]:
            for ndim in range(7):
                shape = rand_shape_nd(ndim, dim=5, allow_zero_size=True)
                # Candidate ``axes`` arguments: None (default), the identity
                # permutation, and a random permutation; a 0-d array only
                # additionally admits the empty tuple.
                axeses = [None]
                if ndim == 0:
                    axeses += [()]
                else:
                    axes = [i for i in range(ndim)]
                    axeses.append(tuple(axes))
                    random.shuffle(axes)
                    axeses.append(tuple(axes))
                for axes in axeses:
                    test_trans = TestTranspose(axes)
                    if hybridize:
                        test_trans.hybridize()
                    x = rand_ndarray(shape).as_np_ndarray()
                    x = x.astype(dtype)
                    x.attach_grad()
                    np_out = _np.transpose(x.asnumpy(), axes)
                    with mx.autograd.record():
                        mx_out = test_trans(x)
                    assert mx_out.shape == np_out.shape
                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
                    mx_out.backward()
                    np_backward = np_transpose_grad(np_out.shape, dtype, axes)
                    assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5, use_broadcast=False)

                    # Also check the imperative (functional) entry point.
                    mx_out = np.transpose(x, axes)
                    np_out = _np.transpose(x.asnumpy(), axes)
                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, use_broadcast=False)
def forward(self, data, valid_length):
    """Encode embedded source sequences.

    Returns the encoded sequences together with the unchanged valid lengths.
    """
    # Add positional information to the input embeddings.
    hidden = self.pos_embedding(data, None)

    dropout = self.config.dropout_prepost
    if dropout > 0.0:
        hidden = npx.dropout(data=hidden, p=dropout)

    # Valid lengths broadcast over attention heads: (batch_size * heads, seq_len)
    att_valid_length = layers.prepare_source_valid_lengths(valid_length,
                                                           hidden,
                                                           num_heads=self.config.attention_heads)

    # Encoder blocks operate in time-major layout.
    hidden = np.transpose(hidden, axes=(1, 0, 2))
    for block in self.layers:
        hidden = block(hidden, att_valid_length)
    hidden = self.final_process(hidden, None)

    # Back to batch-major before returning.
    return np.transpose(hidden, axes=(1, 0, 2)), valid_length
def forward(self, rel_positions, query=None):
    """Forward function

    Parameters
    ----------
    rel_positions
        The relative shifts. Shape (query_length, mem_length).
        Each element represents the shift between the :math:`i-th` element of query and
        the :math:`j-th` element of memory.
    query
        The query for computing the relative scores. The shape depends on the layout.
        If we use T5 attention, the query will not be used.

    Returns
    -------
    rel_scores
        The relative attention scores
        Can have shape (batch_size, num_heads, query_length, mem_length)
        or (num_heads, query_length, mem_length)
    """
    if self._method == 'transformer_xl' or self._method == 'shaw':
        assert query is not None, 'Must specify query if method={}'.format(self._method)
        # Clip the relative shifts so out-of-range distances share an embedding.
        if self._bidirectional:
            if self._max_distance is not None:
                rel_positions = np.clip(rel_positions,
                                        a_min=-self._max_distance, a_max=self._max_distance)
        else:
            if self._max_distance is not None:
                rel_positions = np.clip(rel_positions,
                                        a_min=0, a_max=self._max_distance)
        # Embed each distinct shift only once, then scatter back via rev_index.
        # uniq_rel.shape = (#uniq,), rev_index.shape = (L_q, L_m)
        uniq_rel, rev_index = np.unique(rel_positions, return_inverse=True)
        uniq_rel_pos_embed = self._rel_pos_embed(uniq_rel)
        if self._method == 'transformer_xl':
            uniq_rel_pos_embed = self._rel_proj(self._dropout_layer(uniq_rel_pos_embed))
        # Shape (#uniq, K, C_q)
        uniq_rel_pos_embed = npx.reshape(uniq_rel_pos_embed,
                                         (-2, self._num_heads, self._head_query_units))
        # Calculate the dot-product between query and the relative positional embeddings.
        # After the calculation, rel_score.shape = (L_q, #uniq, N, K)
        # Each branch below is the same contraction, adapted to the query layout;
        # the matmul path is a fallback for when einsum is unavailable.
        if self._layout == 'NKT':
            # query_for_rel: (N, K, L_q, C_q)
            if self._use_einsum:
                rel_score = np.einsum('bnid,jnd->ijbn', query, uniq_rel_pos_embed)
            else:
                rel_score = np.transpose(
                    np.matmul(query,
                              np.transpose(uniq_rel_pos_embed, (1, 2, 0))),
                    (2, 3, 0, 1)
                )
        elif self._layout == 'NTK':
            # query_for_rel: (N, L_q, K, C_q)
            if self._use_einsum:
                rel_score = np.einsum('bind,jnd->ijbn', query, uniq_rel_pos_embed)
            else:
                rel_score = np.transpose(
                    np.matmul(np.swapaxes(query, 1, 2),
                              np.transpose(uniq_rel_pos_embed, (1, 2, 0))),
                    (2, 3, 0, 1)
                )
        elif self._layout == 'TNK':
            # query_for_rel: (L_q, N, K, C_q)
            if self._use_einsum:
                rel_score = np.einsum('ibnd,jnd->ijbn', query, uniq_rel_pos_embed)
            else:
                rel_score = np.transpose(
                    np.matmul(np.transpose(query, (1, 2, 0, 3)),
                              np.transpose(uniq_rel_pos_embed, (1, 2, 0))),
                    (2, 3, 0, 1)
                )
        else:
            raise NotImplementedError
        # Expand (L_q, #uniq, N, K) back to (L_q, L_m, N, K) by looking up each
        # (query, memory) pair's unique-shift index.
        # We use gather_nd to select the elements
        # TODO(sxjscience) Use advanced indexing once available
        rev_index = npx.reshape_like(rev_index, rel_positions).astype(np.int32)
        query_idx = np.expand_dims(npx.arange_like(rel_positions, axis=0).astype(np.int32),
                                   axis=-1) + np.zeros_like(rev_index)
        rel_score = npx.gather_nd(rel_score, np.stack([query_idx, rev_index]))
        # Final layout: (N, K, L_q, L_m).
        rel_score = np.transpose(rel_score, (2, 3, 0, 1))
    elif self._method == 't5':
        # T5 uses learned per-head biases directly, independent of the query.
        # shape is (K, L_q, L_m)
        rel_score = self._rel_pos_embed(rel_positions).transpose((2, 0, 1))
    else:
        raise NotImplementedError
    return rel_score
def forward(self,
            step_input: np.ndarray,
            states: List[np.ndarray]) -> Tuple[np.ndarray, List[np.ndarray]]:
    """
    Runs the decoder layers over ``step_input`` (a single step at inference,
    the full target sequence during training) and returns the decoder output
    together with the updated state list.

    NOTE: the positional layout of ``states`` must match what
    ``init_state_from_encoder`` produces for the current ``inference_only`` mode.
    """
    mask = None
    if self.inference_only:
        # Inference state layout: [steps, valid_length, enc_att_kv * num_layers,
        # autoregressive states...]; encoder outputs themselves are not carried.
        steps, source_valid_length, *other = states
        source_encoded = None  # use constant pre-computed key value projections from the states
        enc_att_kv = other[:self.config.num_layers]
        autoregr_states = other[self.config.num_layers:]
    else:
        # Training processes all positions at once and therefore needs the
        # autoregressive (causal) attention mask.
        if any(layer.needs_mask for layer in self.layers):
            mask = self.autoregressive_bias(step_input)  # mask: (1, length, length)
        steps, source_encoded, source_valid_length, *autoregr_states = states
        enc_att_kv = [None for _ in range(self.config.num_layers)]

    if any(layer.num_state_tensors > 1 for layer in self.layers):
        # separates autoregressive states by layer
        states_iter = iter(autoregr_states)
        autoregr_states = [list(islice(states_iter, 0, layer.num_state_tensors))
                           for layer in self.layers]

    # (batch_size * heads, query_length)
    source_valid_length = layers.prepare_source_valid_lengths(source_valid_length,
                                                              step_input,
                                                              num_heads=self.config.attention_heads)

    # target: (batch_size, length, model_size)
    target = self.pos_embedding(step_input, steps)
    # (length, batch_size, model_size) -- layers operate time-major
    target = np.transpose(target, axes=(1, 0, 2))

    if self.config.dropout_prepost > 0.0:
        target = npx.dropout(data=target, p=self.config.dropout_prepost)

    new_autoregr_states = []
    for layer, layer_autoregr_state, layer_enc_att_kv in zip(self.layers,
                                                             autoregr_states,
                                                             enc_att_kv):
        target, new_layer_autoregr_state = layer(target,
                                                 mask,
                                                 source_encoded,
                                                 source_valid_length,
                                                 layer_autoregr_state,
                                                 layer_enc_att_kv)
        new_autoregr_states += [*new_layer_autoregr_state]

    target = self.final_process(target, None)
    # back to (batch_size, length, model_size)
    target = np.transpose(target, axes=(1, 0, 2))

    # Inference: increment steps by 1 (discarded in training)
    steps = steps + 1

    if self.inference_only:
        # pass in cached encoder states: states[1] is the valid length,
        # states[2:2+num_layers] are the constant per-layer K/V projections.
        encoder_attention_keys_values = states[2:2 + self.config.num_layers]
        new_states = [steps, states[1]] + encoder_attention_keys_values + new_autoregr_states
    else:
        encoder_outputs = states[1]
        encoder_valid_length = states[2]
        new_states = [steps, encoder_outputs, encoder_valid_length] + new_autoregr_states
    return target, new_states