def forward(self, best_hyp_indices, best_word_indices,
            finished, scores_accumulated, lengths, reference_lengths,
            factors=None):

    # Reorder fixed-size beam data according to best_hyp_indices (ascending)
    finished = np.take(finished, best_hyp_indices, axis=0)
    lengths = np.take(lengths, best_hyp_indices, axis=0)
    reference_lengths = np.take(reference_lengths, best_hyp_indices, axis=0)

    # Normalize hypotheses that JUST finished
    all_finished = np.expand_dims(np.logical_or(best_word_indices == self.pad_id,
                                                best_word_indices == self.eos_id), axis=1)
    newly_finished = np.logical_xor(all_finished, finished)

    scores_accumulated = np.where(newly_finished,
                                  self._scorer(scores_accumulated,
                                               npx.cast(lengths, self.dtype),
                                               reference_lengths),
                                  scores_accumulated)

    # Recompute finished. Hypotheses are finished if they are extended with <pad> or <eos>
    finished = np.logical_or(best_word_indices == self.pad_id,
                             best_word_indices == self.eos_id)
    finished = npx.cast(np.expand_dims(finished, axis=1), 'int32')

    # Concatenate sorted secondary target factors to best_word_indices. Shape: (batch*beam, num_factors)
    best_word_indices = np.expand_dims(best_word_indices, axis=1)

    if factors is not None:
        secondary_factors = np.take(factors, best_hyp_indices, axis=0)
        best_word_indices = np.concatenate((best_word_indices, secondary_factors), axis=1)

    return best_word_indices, finished, scores_accumulated, lengths, reference_lengths
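# Hypothetical standalone sketch (illustrative values only, same mxnet.np API as above):
# np.take along axis 0 is what reorders the per-hypothesis beam state by best_hyp_indices.
from mxnet import np

lengths = np.array([[3.], [5.], [2.]])                  # one row of state per hypothesis
best_hyp_indices = np.array([1, 1, 0], dtype='int32')   # hypotheses selected by topk
print(np.take(lengths, best_hyp_indices, axis=0))       # rows 1, 1, 0 -> [[5.], [5.], [2.]]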
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Predict for sequence to sequence."""
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(axis=2)
        pred = dec_X.squeeze(axis=0).astype('int32').item()
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # Once the end-of-sequence token is predicted, the generation of the
        # output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device):
    """Predict sequences (defined in Chapter 9)."""
    # src_tokens = src_vocab[src_sentence.lower().split(' ')] + [src_vocab['<eos>']]
    src_tokens = src_vocab[get_word_list(
        src_sentence.lower())] + [src_vocab['<eos>']]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0)
    enc_outputs = model.encoder(enc_X, enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0)
    output_seq = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(axis=2)
        pred = dec_X.squeeze(axis=0).astype('int32').item()
        # Once the end-of-sequence token is predicted, the generation of
        # the output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq))
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps, ctx):
    # Original: src_tokens = src_vocab[src_sentence.lower().split(' ')]
    # TODO: if the vocab does not contain a token, assign an index other than 0
    src_tokens = src_vocab[src_sentence.lower().split(',')]
    num_steps = len(src_tokens)  # Fix: decode at most as many steps as source tokens
    enc_valid_len = np.array([len(src_tokens)], ctx=ctx)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = np.array(src_tokens, ctx=ctx)
    # Add the batch_size dimension
    enc_outputs = model.encoder(np.expand_dims(enc_X, axis=0), enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=ctx), axis=0)
    predict_tokens = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with the highest score is used as the next time step input
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').item()
        if py == tgt_vocab['<eos>']:
            # End-of-sequence token predicted: stop decoding
            break
        elif py == tgt_vocab['<unk>']:
            # Unknown token predicted: stop decoding (TODO: handle <unk> more gracefully)
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
def forward(self, positions):
    """

    Parameters
    ----------
    positions : NDArray
        Shape (..., )

    Returns
    -------
    ret :
        Shape (..., units)
    """
    emb = np.expand_dims(positions.astype(self._dtype), axis=-1) * self.base_mult.data()
    sin_emb = np.sin(emb)
    cos_emb = np.cos(emb)
    if self._units % 2 == 0:
        return np.concatenate([sin_emb, cos_emb], axis=-1)
    else:
        return np.concatenate([
            sin_emb, cos_emb,
            np.expand_dims(np.zeros_like(positions).astype(self._dtype), axis=-1)
        ], axis=-1)
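# Hypothetical standalone sketch of the sinusoidal scheme above, assuming an even number of
# units and a base of 10000; `base_mult` here stands in for the constant held by the block.
from mxnet import np

units = 8
positions = np.arange(5, dtype='float32')                                            # (5,)
base_mult = 1.0 / np.power(10000.0, np.arange(units // 2, dtype='float32') * 2 / units)
emb = np.expand_dims(positions, axis=-1) * base_mult                                 # (5, units // 2)
pos_emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=-1)                        # (5, units)
print(pos_emb.shape)                                                                 # (5, 8)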
def get_end_logits(self, contextual_embedding, start_positions, p_mask):
    """

    Parameters
    ----------
    contextual_embedding
        Shape (batch_size, sequence_length, C)
    start_positions
        Shape (batch_size, N)
        We process multiple candidates simultaneously
    p_mask
        Shape (batch_size, sequence_length)

    Returns
    -------
    end_logits
        Shape (batch_size, N, sequence_length)
    """
    # Select the features at the start_positions
    # start_feature will have shape (batch_size, N, C)
    start_features = select_vectors_by_position(contextual_embedding, start_positions)
    # Concatenate the start_feature and the contextual_embedding
    contextual_embedding = np.expand_dims(contextual_embedding, axis=1)  # (B, 1, T, C)
    start_features = np.expand_dims(start_features, axis=2)  # (B, N, 1, C)
    concat_features = np.concatenate([npx.broadcast_like(start_features,
                                                          contextual_embedding, 2, 2),
                                      npx.broadcast_like(contextual_embedding,
                                                         start_features, 1, 1)],
                                     axis=-1)  # (B, N, T, 2C)
    end_scores = self.end_scores(concat_features)
    end_scores = np.squeeze(end_scores, -1)
    end_logits = masked_logsoftmax(end_scores, mask=np.expand_dims(p_mask, axis=1),
                                   axis=-1)
    return end_logits
def get_bert_encoding(net, tokens_a, tokens_b=None):
    tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
    ctx = d2l.try_gpu()
    token_ids = np.expand_dims(np.array(vocab[tokens], ctx=ctx), axis=0)
    segments = np.expand_dims(np.array(segments, ctx=ctx), axis=0)
    valid_len = np.expand_dims(np.array(len(tokens), ctx=ctx), axis=0)
    encoded_X, _, _ = net(token_ids, segments, valid_len)
    return encoded_X
def _get_relative_position(self, hidden_states):
    query_position = np.expand_dims(npx.arange_like(hidden_states, axis=self.time_axis),
                                    axis=-1)
    mem_position = np.expand_dims(npx.arange_like(hidden_states, axis=self.time_axis),
                                  axis=0)
    relative_position = mem_position - query_position
    return relative_position.astype(np.int32)
def test_expand_dims():
    inp = np.zeros((INT_OVERFLOW))
    inp[-1] = 1
    out1 = np.expand_dims(inp, axis=0)
    out2 = np.expand_dims(out1, axis=2)
    assert out1.shape == (1, INT_OVERFLOW)
    assert out2.shape == (1, INT_OVERFLOW, 1)
    assert out1[0, -1] == 1
    assert out2[0, -1, 0] == 1
def forward(self, x: np.ndarray) -> np.ndarray:
    # Shape: (length,)
    length_array = npx.arange_like(x, axis=1)
    # matrix with lower triangle and main diagonal set to 0, upper triangle set to 1
    # Shape: (length, length)
    bias = npx.broadcast_greater(np.expand_dims(length_array, axis=0),
                                 np.expand_dims(length_array, axis=1))
    bias = bias * -C.LARGE_VALUES[self._dtype]
    bias = np.expand_dims(bias, axis=0)
    return npx.stop_gradient(bias)
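# Hypothetical standalone sketch (length 4) of the autoregressive bias pattern built above:
# future positions j > i receive a large negative bias (-1e8 stands in for C.LARGE_VALUES).
from mxnet import np

length_array = np.arange(4, dtype='float32')
col_idx = np.expand_dims(length_array, axis=0)              # (1, 4): key position j
row_idx = np.expand_dims(length_array, axis=1)              # (4, 1): query position i
mask = (col_idx > row_idx).astype('float32')                # mask[i, j] = 1 where j > i
bias = np.expand_dims(mask * -1e8, axis=0)                  # (1, 4, 4), added to attention logits
print(mask)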
def init_state_from_encoder(self,
                            encoder_outputs: np.ndarray,
                            encoder_valid_length: Optional[np.ndarray] = None,
                            target_embed: Optional[np.ndarray] = None) -> List[np.ndarray]:
    """
    Returns the initial states given encoder output. States for teacher-forced training are encoder outputs
    and a valid length mask for encoder outputs.
    At inference, this method returns the following state tuple:
    valid length bias, step state,
    [projected encoder attention keys, projected encoder attention values] * num_layers,
    [autoregressive state dummies] * num_layers.

    :param encoder_outputs: Encoder outputs. Shape: (batch, source_length, encoder_dim).
    :param encoder_valid_length: Valid lengths of encoder outputs. Shape: (batch,).
    :param target_embed: Target-side embedding layer output. Shape: (batch, target_length, target_embedding_dim).
    :return: Initial states.
    """
    if target_embed is None:  # Inference: initial step = 0. Shape: (batch_size, 1)
        steps = np.expand_dims(np.zeros_like(encoder_valid_length), axis=1)
    else:  # Training: steps up to target length. Shape: (1, target_length)
        steps = np.expand_dims(npx.arange_like(target_embed, axis=1), axis=0)

    if self.inference_only:
        # Encoder projection caching, therefore we don't pass the encoder_outputs
        states = [steps, encoder_valid_length]

        for layer in self.layers:
            enc_att_kv = layer.enc_attention.ff_kv(encoder_outputs)
            states.append(np.transpose(enc_att_kv, axes=(1, 0, 2)))
    else:
        # NO encoder projection caching
        states = [steps,
                  np.transpose(encoder_outputs, axes=(1, 0, 2)),
                  encoder_valid_length]

    _batch_size = encoder_outputs.shape[0]
    _ctx = encoder_outputs.ctx
    _dtype = encoder_outputs.dtype
    dummy_autoregr_states = [np.zeros(layer.get_states_shape(_batch_size), ctx=_ctx, dtype=_dtype)
                             for layer in self.layers
                             for _ in range(layer.num_state_tensors)]

    states += dummy_autoregr_states
    return states
def decode_step(self,
                step_input: np.ndarray,
                states: List[np.ndarray],
                vocab_slice_ids: Optional[np.ndarray] = None):
    outputs = []  # type: List[np.ndarray]
    new_states = []  # type: List[np.ndarray]
    factor_outputs = []  # type: List[List[np.ndarray]]
    state_index = 0
    for model, model_state_structure in zip(self._models, self.state_structure()):
        model_states = states[state_index:state_index + len(model_state_structure)]
        state_index += len(model_state_structure)
        logits, model_states, target_factor_outputs = model.decode_step(step_input, model_states,
                                                                        vocab_slice_ids)
        probs = npx.softmax(logits, axis=-1, temperature=self._softmax_temperature)
        outputs.append(probs)
        target_factor_probs = [npx.softmax(tfo, axis=-1) for tfo in target_factor_outputs]
        factor_outputs.append(target_factor_probs)
        new_states += model_states
    scores = self._interpolation(outputs)

    target_factors = None  # type: Optional[np.ndarray]
    if factor_outputs:
        # target factors are greedily 'decoded'.
        factor_predictions = [npx.cast(np.expand_dims(np.argmin(self._interpolation(fs), axis=-1), axis=1),
                                       dtype='int32')
                              for fs in zip(*factor_outputs)]
        if factor_predictions:
            target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
                else np.concatenate(factor_predictions, axis=1)
    return scores, new_states, target_factors
def _training_cell_state_transform(previous_cell_state, weighted_inputs,
                                   forget_rates) -> Tuple[np.ndarray, np.ndarray]:
    """Update SSRU cell at training time"""
    def _time_step_update(step_input_and_forget_rate,
                          previous_step_state) -> Tuple[np.ndarray, np.ndarray]:
        """
        Recurrently update the SSRU cell state for one time step.

        :param step_input_and_forget_rate: List = [step_input, forget_rate]
        :param previous_step_state: cell state at (t-1)
        :return: twice the current time step state. NOTE: The first instance will be stacked in
        the final foreach output and the second will be the input to the next time_step_update iteration.
        """
        step_input, forget_rate = step_input_and_forget_rate  # each of shape (batch_size, model_size)
        current_step_state = forget_rate * previous_step_state + step_input
        return current_step_state, current_step_state

    # (max_length, batch, input_depth), (batch, input_depth)
    cell_state, last_step_state = npx.foreach(_time_step_update,
                                              [weighted_inputs, forget_rates],
                                              np.squeeze(previous_cell_state, axis=0))

    return cell_state, np.expand_dims(last_step_state, axis=0)
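# Hypothetical standalone sketch (illustrative shapes) of the recurrence npx.foreach scans above:
# c_t = f_t * c_{t-1} + x_t over the time axis.
from mxnet import np

T, batch, depth = 3, 2, 4
weighted_inputs = np.ones((T, batch, depth))
forget_rates = np.full((T, batch, depth), 0.5)
state = np.zeros((batch, depth))            # squeezed previous_cell_state
steps = []
for t in range(T):
    state = forget_rates[t] * state + weighted_inputs[t]
    steps.append(state)
cell_state = np.stack(steps)                # (T, batch, depth), like the stacked foreach output
print(cell_state[:, 0, 0])                  # [1.0, 1.5, 1.75]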
def get_answerable_logits(self, contextual_embedding, p_mask):
    """Get the answerable logits.

    Parameters
    ----------
    contextual_embedding
        Shape (batch_size, sequence_length, C)
    p_mask
        Shape (batch_size, sequence_length)
        Mask the sequence.
        0 --> Denote that the element is masked,
        1 --> Denote that the element is not masked

    Returns
    -------
    answerable_logits
        Shape (batch_size, 2)
    """
    # Shape (batch_size, sequence_length)
    start_scores = np.squeeze(self.start_scores(contextual_embedding), -1)
    start_score_weights = masked_softmax(start_scores, p_mask, axis=-1)
    start_agg_feature = npx.batch_dot(np.expand_dims(start_score_weights, axis=1),
                                      contextual_embedding)
    start_agg_feature = np.squeeze(start_agg_feature, 1)
    cls_feature = contextual_embedding[:, 0, :]
    answerable_scores = self.answerable_scores(np.concatenate([start_agg_feature,
                                                               cls_feature], axis=-1))
    answerable_logits = npx.log_softmax(answerable_scores, axis=-1)
    return answerable_logits
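# Hypothetical standalone sketch (illustrative sizes) of the weighted pooling above:
# batch_dot of weights (B, 1, T) with features (B, T, C) yields one pooled vector per example.
from mxnet import np, npx

B, T, C = 2, 5, 3
weights = npx.softmax(np.zeros((B, T)), axis=-1)                     # uniform weights, (B, T)
features = np.ones((B, T, C))
pooled = npx.batch_dot(np.expand_dims(weights, axis=1), features)    # (B, 1, C)
pooled = np.squeeze(pooled, 1)                                       # (B, C)
print(pooled.shape)                                                  # (2, 3)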
def multibox_target(anchors, labels):
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.ctx, anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        # e.g. [-1, 0, 1, -1, 1]
        anchors_bbox_map = match_anchor_to_bbox(label[:, 1:], anchors, device)
        bbox_mask = np.tile((np.expand_dims((anchors_bbox_map >= 0), axis=-1)),
                            (1, 4)).astype('int32')
        # Initialize class_labels and assigned bbox coordinates with zeros
        class_labels = np.zeros(num_anchors, dtype=np.int32, ctx=device)
        assigned_bb = np.zeros((num_anchors, 4), dtype=np.float32, ctx=device)
        # Assign class labels to the anchor boxes using matched gt bbox labels
        # If no gt bbox is assigned to an anchor box, then let the
        # class_labels and assigned_bb remain zero, i.e the background class
        indices_true = np.nonzero(anchors_bbox_map >= 0)[0]  # e.g. [1, 2, 4]
        bb_idx = anchors_bbox_map[indices_true]  # e.g. [0, 1, 1]
        class_labels[indices_true] = label[bb_idx, 0].astype('int32') + 1  # Get category
        assigned_bb[indices_true] = label[bb_idx, 1:]  # Get ground-truth
        # offset transformations
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = np.stack(batch_offset)
    bbox_mask = np.stack(batch_mask)
    class_labels = np.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)
def forward(self, hidden_states, valid_length, mem_states, mem_valid_length):
    # 1. relative position embeddings and attention masks
    position_embeddings = self.relative_position_encoder(
        self._get_relative_position(hidden_states))
    # relative position embedding is not used for cross attention,
    # so we just obtain the correct shape and fill it with 0
    mem_relative_position = np.zeros_like(
        self._get_relative_position(hidden_states, mem_states))
    mem_position_embeddings = np.repeat(np.expand_dims(mem_relative_position, axis=0),
                                        self._num_heads, axis=0)
    self_attn_mask = gen_self_attn_mask(hidden_states, valid_length,
                                        dtype=self._dtype,
                                        attn_type='causal',
                                        layout=self.layout)
    mem_attn_mask = gen_mem_attn_mask(mem_states, mem_valid_length,
                                      hidden_states, valid_length,
                                      dtype=self._dtype,
                                      layout=self.layout)
    # 2. decoder blocks and other layers
    hidden_states = self.dropout(hidden_states)
    for layer in self.layers:
        hidden_states = layer(hidden_states, self_attn_mask, position_embeddings,
                              mem_states, mem_attn_mask, mem_position_embeddings)
    hidden_states = self.final_layer_norm(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return hidden_states
def forward(self, data, steps):  # pylint: disable=arguments-differ
    """
    Applies positional embeddings to input data.

    :param data: Input data. Shape: (batch, length or 1, num_embed)
    :param steps: Optional steps input. If given, shape is (batch_size or 1, seq_len,)

    :return: Data with positional embeddings added
    """
    # (length, num_embed)
    if steps is None:
        # (batch, length, num_embed)
        pos_embedding = npx.slice_like(np.expand_dims(self.weight.data(), axis=0),
                                       data, axes=(1,))
    else:
        # (batch_size or 1, seq_len, num_embed)
        pos_embedding = npx.embedding(steps, self.weight.data(), self.max_seq_len, self.num_embed)

    if self.weight_type == 'fixed':
        pos_embedding = npx.stop_gradient(pos_embedding)

    if self.scale_up_input:
        data = data * (self.num_embed ** 0.5)

    return data + pos_embedding
def forward(self, user_id, seq, item_id):
    item_embs = np.expand_dims(self.Q(seq), 1)
    user_emb = self.P(user_id)  # (4096, 10)
    out, out_h, out_v, out_hs = None, None, None, []
    # Vertical convolution
    if self.d_prime:
        out_v = self.conv_v(item_embs)
        out_v = out_v.reshape(out_v.shape[0], self.fc1_dim_v)  # (4096, 4*10)
    # Horizontal convolution over the time dimension
    if self.d:
        for conv, maxp in zip(self.conv_h, self.max_pool):
            # Slide the convolution window over the sequence
            conv_out = np.squeeze(npx.relu(conv(item_embs)), axis=3)
            t = maxp(conv_out)
            pool_out = np.squeeze(t, axis=2)
            out_hs.append(pool_out)
        out_h = np.concatenate(out_hs, axis=1)  # (4096, 16*3)
    out = np.concatenate([out_v, out_h], axis=1)  # (4096, 4*10+16*3)
    z = self.fc(self.dropout(out))  # (4096, 10)
    # Concatenate with the user embedding
    x = np.concatenate([z, user_emb], axis=1)  # (4096, 20)
    # Interact with the item embedding
    q_prime_i = np.squeeze(self.Q_prime(item_id))  # (4096, 20)
    b = np.squeeze(self.b(item_id))
    res = (x * q_prime_i).sum(1) + b  # (4096,)
    return res
def forward(self, tokens, token_types, valid_length, p_mask, start_position):
    """

    Parameters
    ----------
    tokens
        Shape (batch_size, sequence_length)
    token_types
        Shape (batch_size, sequence_length)
    valid_length
        Shape (batch_size,)
    p_mask
        Shape (batch_size, sequence_length)
    start_position
        Shape (batch_size,)

    Returns
    -------
    start_logits
        Shape (batch_size, sequence_length)
    end_logits
        Shape (batch_size, sequence_length)
    answerable_logits
    """
    if self.use_segmentation:
        contextual_embeddings = self.backbone(tokens, token_types, valid_length)
    else:
        contextual_embeddings = self.backbone(tokens, valid_length)
    start_logits = self.get_start_logits(contextual_embeddings, p_mask)
    end_logits = self.get_end_logits(contextual_embeddings,
                                     np.expand_dims(start_position, axis=1),
                                     p_mask)
    end_logits = np.squeeze(end_logits, axis=1)
    answerable_logits = self.get_answerable_logits(contextual_embeddings, p_mask)
    return start_logits, end_logits, answerable_logits
def gen_rel_position(data, past_data=None, dtype=np.int32, layout='NT'):
    """Create a matrix of relative position for RelAttentionScoreCell.

    The relative position is defined as the index difference: `mem_i` - `query_j`.
    Note, though, that the implementation here makes sense in self-attention's setting,
    but not in cross-attention's. Hence, both `mem_i` and `query_j` are time indices from
    `data` (or, in incremental decoding's case, the concatenated sequence from the current
    stepwise `data` and the previous steps `past_data`).

    Parameters
    ----------
    data
        The data. Under incremental decoding, seq_length = 1.

        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)
    past_data
        This is only used under incremental decoding. Stacked data from previous steps.
    dtype
        Data type of the mask
    layout
        Layout of the data + past_data

    Returns
    -------
    relative_position :
        Shape (query_length, mem_length) where query_length = mem_length = seq_length
    """
    time_axis = 1 if layout == 'NT' else 0
    if past_data is None:
        position = npx.arange_like(data, axis=time_axis)
    else:
        # for incremental decoding only, where past data is of the shape:
        # NT(NTK): (B, L_seq, num_heads, n_kv) -> (B, L_seq, inner_dim)
        # TN(TNK): (L_seq, B, num_heads, n_kv) -> (L_seq, B, inner_dim)
        past_data = npx.reshape(past_data, (-2, -2, -5))
        position = npx.arange_like(
            np.concatenate([past_data, data], axis=time_axis),
            axis=time_axis
        )
    query_position = np.expand_dims(position, axis=-1)
    mem_position = np.expand_dims(position, axis=0)
    relative_position = mem_position - query_position
    return relative_position.astype(np.int32)  # shape (L_seq, L_seq)
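# Hypothetical standalone sketch (length 4) of the relative-position matrix built above:
# entry [j, i] = mem_i - query_j.
from mxnet import np

position = np.arange(4, dtype='float32')
query_position = np.expand_dims(position, axis=-1)    # (4, 1)
mem_position = np.expand_dims(position, axis=0)       # (1, 4)
relative_position = (mem_position - query_position).astype(np.int32)
print(relative_position)
# [[ 0  1  2  3]
#  [-1  0  1  2]
#  [-2 -1  0  1]
#  [-3 -2 -1  0]]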
def forward(self, queries, keys, values, valid_lens):
    queries, keys = self.W_q(queries), self.W_k(keys)
    # After dimension expansion, shape of `queries`: (`batch_size`, no. of
    # queries, 1, `num_hiddens`) and shape of `keys`: (`batch_size`, 1,
    # no. of key-value pairs, `num_hiddens`). Sum them up with
    # broadcasting
    features = np.expand_dims(queries, axis=2) + np.expand_dims(keys, axis=1)
    features = np.tanh(features)
    # There is only one output of `self.w_v`, so we remove the last
    # one-dimensional entry from the shape. Shape of `scores`:
    # (`batch_size`, no. of queries, no. of key-value pairs)
    scores = np.squeeze(self.w_v(features), axis=-1)
    self.attention_weights = masked_softmax(scores, valid_lens)
    # Shape of `values`: (`batch_size`, no. of key-value pairs, value
    # dimension)
    return npx.batch_dot(self.dropout(self.attention_weights), values)
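# Hypothetical standalone sketch (illustrative sizes) of the broadcasting trick above:
# expanding queries at axis 2 and keys at axis 1 yields a pairwise (query, key) feature grid.
from mxnet import np

batch_size, num_queries, num_keys, num_hiddens = 2, 3, 5, 8
queries = np.ones((batch_size, num_queries, num_hiddens))
keys = np.ones((batch_size, num_keys, num_hiddens))
features = np.expand_dims(queries, axis=2) + np.expand_dims(keys, axis=1)
print(features.shape)    # (2, 3, 5, 8): one feature vector per (query, key) pair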
def forward(self, X, state):
    enc_outputs, hidden_state, enc_valid_len = state
    X = self.embedding(X).swapaxes(0, 1)
    outputs = []
    for x in X:
        # query shape: (batch_size, 1, num_hiddens)
        query = np.expand_dims(hidden_state[0][-1], axis=1)
        # context has same shape as query
        context = self.attention_cell(query, enc_outputs, enc_outputs,
                                      enc_valid_len)
        # Concatenate on the feature dimension
        x = np.concatenate((context, np.expand_dims(x, axis=1)), axis=-1)
        # Reshape x to (1, batch_size, embed_size + num_hiddens)
        out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
        outputs.append(out)
    outputs = self.dense(np.concatenate(outputs, axis=0))
    return outputs.swapaxes(0, 1), [enc_outputs, hidden_state, enc_valid_len]
def forward(self, x, layer_states):
    """

    Parameters
    ----------
    x
        - layout = 'NT'
            Shape (batch_size, seq_length, C_in)
        - layout = 'TN'
            Shape (seq_length, batch_size, C_in)
    layer_states
        - layout = 'NT'
            Shape (2, batch_size, prev_len, C_in)
        - layout = 'TN'
            Shape (2, prev_len, batch_size, C_in)
    """
    x = self.ln(x)
    if self._layout == 'NT':
        batch_axis, time_axis = 0, 1
        prev_len = npx.shape_array(layer_states)[2]
    else:
        batch_axis, time_axis = 1, 0
        prev_len = npx.shape_array(layer_states)[1]

    query, key, value = np.split(self.qkv(x), 3, axis=-1)
    if layer_states is not None:
        prev_key, prev_value = layer_states[0], layer_states[1]
        key = np.concatenate([prev_key, key], axis=time_axis)
        value = np.concatenate([prev_value, value], axis=time_axis)
    new_states = np.stack([key, value], axis=0)

    # gen mask
    query_pos = npx.arange_like(query, axis=time_axis)
    if prev_len is not None:
        query_pos = query_pos + prev_len
    key_pos = npx.arange_like(key, axis=time_axis)
    # (query_len, key_len)
    mask = (npx.reshape(key_pos, (1, -1)) <=
            npx.reshape(query_pos, (-1, 1))).astype(self._dtype)
    # broadcast to (batch_size, query_len, key_len)
    mask = npx.broadcast_like(np.expand_dims(mask, axis=0), query,
                              lhs_axes=0, rhs_axes=batch_axis)

    query = npx.reshape(query, (-2, -2, self._num_heads, -1))
    key = npx.reshape(key, (-2, -2, self._num_heads, -1))
    value = npx.reshape(value, (-2, -2, self._num_heads, -1))

    out, [_, attn_weight] = self.attention_cell(query, key, value, mask)
    out = self.out_proj(out)
    out = self.hidden_dropout(out)

    return out, new_states
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps, ctx):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    enc_valid_len = np.array([len(src_tokens)], ctx=ctx)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = np.array(src_tokens, ctx=ctx)
    # Add the batch_size dimension
    enc_outputs = model.encoder(np.expand_dims(enc_X, axis=0), enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=ctx), axis=0)
    predict_tokens = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next timestep input
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').item()
        if py == tgt_vocab['<eos>']:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
def _get_relative_position(self, hidden_states, mem_states=None, past_key_value=None):
    if past_key_value is None:
        query_position = np.expand_dims(npx.arange_like(hidden_states, axis=self.time_axis),
                                        axis=-1)
    else:
        # for incremental decoding only, where past key and past value are of shape
        # NT(NTK): (B, L_seq, num_heads, n_kv); TN(TNK): (L_seq, B, num_heads, n_kv)
        query_position = npx.arange_like(
            np.concatenate([hidden_states, past_key_value[0]], axis=self.time_axis),
            axis=self.time_axis)
        query_position = np.expand_dims(query_position, axis=-1)
    mem_position = np.expand_dims(npx.arange_like(
        hidden_states if mem_states is None else mem_states, axis=self.time_axis), axis=0)
    relative_position = mem_position - query_position
    return relative_position.astype(np.int32)
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5,
                       pos_threshold=0.00999999978):
    device, batch_size = cls_probs.ctx, cls_probs.shape[0]
    anchors = np.squeeze(anchors, axis=0)
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    for i in range(batch_size):
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        conf, class_id = np.max(cls_prob[1:], 0), np.argmax(cls_prob[1:], 0)
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)
        # Find all non_keep indices and set the class_id to background
        all_idx = np.arange(num_anchors, dtype=np.int32, ctx=device)
        combined = np.concatenate((keep, all_idx))
        unique, counts = np.unique(combined, return_counts=True)
        non_keep = unique[counts == 1]
        all_id_sorted = np.concatenate((keep, non_keep))
        class_id[non_keep] = -1
        class_id = class_id[all_id_sorted].astype('float32')
        conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted]
        # threshold to be a positive prediction
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = -1
        conf[below_min_idx] = 1 - conf[below_min_idx]
        pred_info = np.concatenate((np.expand_dims(class_id, axis=1),
                                    np.expand_dims(conf, axis=1),
                                    predicted_bb), axis=1)
        out.append(pred_info)
    return np.stack(out)
def forward(self, step_data, past_states):
    mem_states, mem_valid_length, position, past_key_values = past_states
    step_hidden_states = self.model.input_embedding_layer(step_data)
    # NT: (B, d_model) -> (B, 1, d_model); TN: (B, d_model) -> (1, B, d_model)
    step_hidden_states = np.expand_dims(step_hidden_states, axis=self.model._time_axis)
    step_hidden_states, present_key_values = self.model.decoder.incremental_decode(
        step_hidden_states,
        position,
        past_key_values,
        mem_states,
        mem_valid_length
    )
    step_hidden_states = self.output_layer(step_hidden_states)
    # NT: (B, 1, vocab_size) -> (B, vocab_size); TN: (1, B, vocab_size) -> (B, vocab_size)
    step_hidden_states = npx.reshape(step_hidden_states, (-5, -1))
    return step_hidden_states, (mem_states, mem_valid_length, position + 1, present_key_values)
def decode_step(self,
                step_input: np.ndarray,
                states: List,
                vocab_slice_ids: Optional[np.ndarray] = None):
    logits, states, target_factor_outputs = self._model.decode_step(step_input, states, vocab_slice_ids)
    if not self._skip_softmax:
        logits = npx.log_softmax(logits, axis=-1, temperature=self._softmax_temperature)
    scores = -logits

    target_factors = None  # type: Optional[np.ndarray]
    if target_factor_outputs:
        # target factors are greedily 'decoded'.
        factor_predictions = [npx.cast(np.expand_dims(np.argmax(tfo, axis=1), axis=1), dtype='int32')
                              for tfo in target_factor_outputs]
        target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
            else np.concatenate(factor_predictions, axis=1)
    return scores, states, target_factors
def test_create_target_and_shifted_label_sequences():
    pytest.importorskip('mxnet')
    from sockeye import data_io
    from mxnet import np

    target_and_label = np.array([[C.BOS_ID, 4, 17, 35, 12, C.EOS_ID, C.PAD_ID, C.PAD_ID],
                                 [C.BOS_ID, 15, 23, 23, 77, 55, 22, C.EOS_ID],
                                 [C.BOS_ID, 4, C.EOS_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID]])
    target_and_label = np.expand_dims(target_and_label, axis=2)
    expected_lengths = np.array([5, 7, 2])

    target, label = data_io.create_target_and_shifted_label_sequences(target_and_label)

    assert target.shape[0] == label.shape[0] == target_and_label.shape[0]
    assert target.shape[1] == label.shape[1] == target_and_label.shape[1] - 1
    lengths = (target != C.PAD_ID).sum(axis=1).squeeze()
    assert np.allclose(lengths, expected_lengths)
def get_initial_embedding(self, inputs, token_types=None):
    """Get the initial token embeddings that considers the token type and positional embeddings

    Parameters
    ----------
    inputs
        - layout = 'NT'
            Shape (batch_size, seq_length)
        - layout = 'TN'
            Shape (seq_length, batch_size)
    token_types
        The type of tokens. If None, it will be initialized as all zero.

        - layout = 'NT'
            Shape (batch_size, seq_length)
        - layout = 'TN'
            Shape (seq_length, batch_size)

    Returns
    -------
    embedding
        The initial embedding that will be fed into the encoder

        - layout = 'NT'
            Shape (batch_size, seq_length, C_embed)
        - layout = 'TN'
            Shape (seq_length, batch_size, C_embed)
    """
    if self.layout == 'NT':
        time_axis, batch_axis = 1, 0
    else:
        time_axis, batch_axis = 0, 1
    embedding = self.word_embed(inputs)
    if token_types is None:
        token_types = np.zeros_like(inputs)
    type_embedding = self.token_type_embed(token_types)
    embedding = embedding + type_embedding
    if self.pos_embed_type is not None:
        positional_embedding = self.token_pos_embed(npx.arange_like(inputs, axis=time_axis))
        positional_embedding = np.expand_dims(positional_embedding, axis=batch_axis)
        embedding = embedding + positional_embedding
    # Extra layer normalization plus dropout
    embedding = self.embed_layer_norm(embedding)
    embedding = self.embed_dropout(embedding)
    return embedding
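# Hypothetical standalone sketch (illustrative sizes, NT layout) of why the positional embedding
# is expanded at the batch axis above: (seq_length, C) -> (1, seq_length, C) broadcasts over examples.
from mxnet import np

batch_size, seq_length, C = 2, 4, 3
token_embedding = np.ones((batch_size, seq_length, C))
positional_embedding = np.arange(seq_length * C, dtype='float32').reshape(seq_length, C)
embedding = token_embedding + np.expand_dims(positional_embedding, axis=0)   # (2, 4, 3)
print(embedding.shape)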