Example #1
    def forward(self, best_hyp_indices, best_word_indices,
                finished, scores_accumulated, lengths, reference_lengths,
                factors=None):

        # Reorder fixed-size beam data according to best_hyp_indices (ascending)
        finished = np.take(finished, best_hyp_indices, axis=0)
        lengths = np.take(lengths, best_hyp_indices, axis=0)
        reference_lengths = np.take(reference_lengths, best_hyp_indices, axis=0)

        # Normalize hypotheses that JUST finished
        all_finished = np.expand_dims(np.logical_or(best_word_indices == self.pad_id,
                                                    best_word_indices == self.eos_id),
                                      axis=1)
        newly_finished = np.logical_xor(all_finished, finished)

        scores_accumulated = np.where(newly_finished,
                                      self._scorer(scores_accumulated,
                                                   npx.cast(lengths, self.dtype),
                                                   reference_lengths),
                                      scores_accumulated)

        # Recompute finished. Hypotheses are finished if they are extended with <pad> or <eos>
        finished = np.logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id)
        finished = npx.cast(np.expand_dims(finished, axis=1), 'int32')

        # Concatenate sorted secondary target factors to best_word_indices. Shape: (batch*beam, num_factors)
        best_word_indices = np.expand_dims(best_word_indices, axis=1)

        if factors is not None:
            secondary_factors = np.take(factors, best_hyp_indices, axis=0)
            best_word_indices = np.concatenate((best_word_indices, secondary_factors), axis=1)

        return best_word_indices, finished, scores_accumulated, lengths, reference_lengths
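For reference, a minimal sketch (with hypothetical values) of the np.take gather that performs the beam reordering above: rows are selected along axis 0 in the order given by best_hyp_indices.

from mxnet import np

scores = np.array([[0.1], [0.5], [0.3]])                  # (batch*beam, 1)
best_hyp_indices = np.array([2, 0, 1], dtype='int32')
print(np.take(scores, best_hyp_indices, axis=0))          # [[0.3], [0.1], [0.5]]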
Example #2
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Predict for sequence to sequence."""
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(axis=2)
        pred = dec_X.squeeze(axis=0).astype('int32').item()
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # Once the end-of-sequence token is predicted, the generation of the
        # output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq
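A hedged usage sketch, assuming a trained net plus the src_vocab, tgt_vocab, num_steps and device objects from the surrounding d2l chapter:

engs = ['go .', 'i lost .', "he's calm ."]
for eng in engs:
    translation, attention_weight_seq = predict_seq2seq(
        net, eng, src_vocab, tgt_vocab, num_steps, device)
    print(f'{eng} => {translation}')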
Example #3
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device):
    """Predict sequences (defined in Chapter 9)."""
    #src_tokens = src_vocab[src_sentence.lower().split(' ')] + [src_vocab['<eos>']]
    src_tokens = src_vocab[get_word_list(
        src_sentence.lower())] + [src_vocab['<eos>']]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0)
    enc_outputs = model.encoder(enc_X, enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0)
    output_seq = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(axis=2)
        pred = dec_X.squeeze(axis=0).astype('int32').item()
        # Once the end-of-sequence token is predicted, the generation of
        # the output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq))
Example #4
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps, ctx):
    # ORG src_tokens = src_vocab[src_sentence.lower().split(' ')] # TODO : if vocab doesn't contain, instead of 0, assign other index
    src_tokens = src_vocab[src_sentence.lower().split(',')]
    num_steps = len(src_tokens) # Fix
    enc_valid_len = np.array([len(src_tokens)], ctx=ctx)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = np.array(src_tokens, ctx=ctx)
    # Add the batch_size dimension
    enc_outputs = model.encoder(np.expand_dims(enc_X, axis=0), enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=ctx), axis=0)
    predict_tokens = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next timestep input
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').item()
        #print("debug : ", py)
        if py == tgt_vocab['<eos>']:
            print('py : ', py)  #ToDo
            break
        elif py == tgt_vocab['<unk>']:
            print('py : ', py)  #ToDo
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
Example #5
    def forward(self, positions):
        """

        Parameters
        ----------
        positions : NDArray
            Shape (..., )

        Returns
        -------
        ret :
            Shape (..., units)
        """
        emb = np.expand_dims(positions.astype(self._dtype),
                             axis=-1) * self.base_mult.data()
        sin_emb = np.sin(emb)
        cos_emb = np.cos(emb)
        if self._units % 2 == 0:
            return np.concatenate([sin_emb, cos_emb], axis=-1)
        else:
            return np.concatenate([
                sin_emb, cos_emb,
                np.expand_dims(np.zeros_like(positions).astype(self._dtype),
                               axis=-1)
            ],
                                  axis=-1)
Example #6
    def get_end_logits(self, contextual_embedding, start_positions, p_mask):
        """

        Parameters
        ----------
        contextual_embedding
            Shape (batch_size, sequence_length, C)
        start_positions
            Shape (batch_size, N)
            We process multiple candidates simultaneously
        p_mask
            Shape (batch_size, sequence_length)

        Returns
        -------
        end_logits
            Shape (batch_size, N, sequence_length)
        """
        # Select the features at the start_positions
        # start_feature will have shape (batch_size, N, C)
        start_features = select_vectors_by_position(contextual_embedding, start_positions)
        # Concatenate the start_feature and the contextual_embedding
        contextual_embedding = np.expand_dims(contextual_embedding, axis=1)  # (B, 1, T, C)
        start_features = np.expand_dims(start_features, axis=2)  # (B, N, 1, C)
        concat_features = np.concatenate([npx.broadcast_like(start_features,
                                                                 contextual_embedding, 2, 2),
                                            npx.broadcast_like(contextual_embedding,
                                                                 start_features, 1, 1)],
                                           axis=-1)  # (B, N, T, 2C)
        end_scores = self.end_scores(concat_features)
        end_scores = np.squeeze(end_scores, -1)
        end_logits = masked_logsoftmax(end_scores, mask=np.expand_dims(p_mask, axis=1),
                                       axis=-1)
        return end_logits
Example #7
def get_bert_encoding(net, tokens_a, tokens_b=None):
    tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
    ctx = d2l.try_gpu()
    token_ids = np.expand_dims(np.array(vocab[tokens], ctx=ctx), axis=0)
    segments = np.expand_dims(np.array(segments, ctx=ctx), axis=0)
    valid_len = np.expand_dims(np.array(len(tokens), ctx=ctx), axis=0)
    encoded_X, _, _ = net(token_ids, segments, valid_len)
    return encoded_X
Example #8
File: t5.py Project: RickyLLu/gluon-nlp
 def _get_relative_position(self, hidden_states):
     query_position = np.expand_dims(npx.arange_like(hidden_states,
                                                     axis=self.time_axis),
                                     axis=-1)
     mem_position = np.expand_dims(npx.arange_like(hidden_states,
                                                   axis=self.time_axis),
                                   axis=0)
     relative_position = mem_position - query_position
     return relative_position.astype(np.int32)
Example #9
def test_expand_dims():
    inp = np.zeros((INT_OVERFLOW))
    inp[-1] = 1
    out1 = np.expand_dims(inp, axis=0)
    out2 = np.expand_dims(out1, axis=2)
    assert out1.shape == (1, INT_OVERFLOW)
    assert out2.shape == (1, INT_OVERFLOW, 1)
    assert out1[0, -1] == 1
    assert out2[0, -1, 0] == 1
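The test above exercises very large inputs; the basic semantics of np.expand_dims, which all of these examples rely on, is simply to insert a new axis of length 1 at the requested position. A minimal sketch using MXNet's numpy-compatible np module:

from mxnet import np

a = np.array([1., 2., 3.])               # shape (3,)
print(np.expand_dims(a, axis=0).shape)   # (1, 3): new leading (batch) axis
print(np.expand_dims(a, axis=-1).shape)  # (3, 1): new trailing axis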
Example #10
 def forward(self, x: np.ndarray) -> np.ndarray:
     # Shape: (length, 1)
     length_array = npx.arange_like(x, axis=1)
     # matrix with lower triangle and main diagonal set to 0, upper triangle set to 1
     # Shape: (length, length)
     bias = npx.broadcast_greater(np.expand_dims(length_array, axis=0),
                                  np.expand_dims(length_array, axis=1))
     bias = bias * -C.LARGE_VALUES[self._dtype]
     bias = np.expand_dims(bias, axis=0)
     return npx.stop_gradient(bias)
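A small illustrative sketch of the bias the forward above produces, assuming length 3 and using -1e9 in place of C.LARGE_VALUES: key positions strictly greater than the query position (the future) receive a large negative value, all other positions receive 0.

from mxnet import np

pos = np.arange(3)
future = (np.expand_dims(pos, axis=0) > np.expand_dims(pos, axis=1)).astype('float32')
bias = np.expand_dims(future * -1e9, axis=0)   # shape (1, 3, 3)
# bias[0] == [[ 0., -1e9, -1e9],
#             [ 0.,   0., -1e9],
#             [ 0.,   0.,   0.]]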
Example #11
    def init_state_from_encoder(
            self,
            encoder_outputs: np.ndarray,
            encoder_valid_length: Optional[np.ndarray] = None,
            target_embed: Optional[np.ndarray] = None) -> List[np.ndarray]:
        """
        Returns the initial states given encoder output. States for teacher-forced training are encoder outputs
        and a valid length mask for encoder outputs.
        At inference, this method returns the following state tuple:
        valid length bias, step state,
        [projected encoder attention keys, projected encoder attention values] * num_layers,
        [autoregressive state dummies] * num_layers.

        :param encoder_outputs: Encoder outputs. Shape: (batch, source_length, encoder_dim).
        :param encoder_valid_length: Valid lengths of encoder outputs. Shape: (batch,).
        :param target_embed: Target-side embedding layer output. Shape: (batch, target_length, target_embedding_dim).
        :return: Initial states.
        """
        if target_embed is None:  # Inference: initial step = 0. Shape: (batch_size, 1)
            steps = np.expand_dims(np.zeros_like(encoder_valid_length), axis=1)
        else:  # Training: steps up to target length. Shape: (1, target_length)
            steps = np.expand_dims(npx.arange_like(target_embed, axis=1),
                                   axis=0)

        if self.inference_only:
            # Encoder projection caching, therefore we don't pass the encoder_outputs
            states = [steps, encoder_valid_length]

            for layer in self.layers:
                enc_att_kv = layer.enc_attention.ff_kv(encoder_outputs)
                states.append(np.transpose(enc_att_kv, axes=(1, 0, 2)))
        else:
            # NO encoder projection caching
            states = [
                steps,
                np.transpose(encoder_outputs, axes=(1, 0, 2)),
                encoder_valid_length
            ]

        _batch_size = encoder_outputs.shape[0]
        _ctx = encoder_outputs.ctx
        _dtype = encoder_outputs.dtype
        dummy_autoregr_states = [
            np.zeros(layer.get_states_shape(_batch_size),
                     ctx=_ctx,
                     dtype=_dtype) for layer in self.layers
            for _ in range(layer.num_state_tensors)
        ]

        states += dummy_autoregr_states
        return states
Example #12
    def decode_step(self,
                    step_input: np.ndarray,
                    states: List[np.ndarray],
                    vocab_slice_ids: Optional[np.ndarray] = None):
        outputs = []  # type: List[np.ndarray]
        new_states = []  # type: List[np.ndarray]
        factor_outputs = []  # type: List[List[np.ndarray]]
        state_index = 0
        for model, model_state_structure in zip(self._models, self.state_structure()):
            model_states = states[state_index:state_index+len(model_state_structure)]
            state_index += len(model_state_structure)
            logits, model_states, target_factor_outputs = model.decode_step(step_input, model_states, vocab_slice_ids)
            probs = npx.softmax(logits, axis=-1, temperature=self._softmax_temperature)
            outputs.append(probs)
            target_factor_probs = [npx.softmax(tfo, axis=-1) for tfo in target_factor_outputs]
            factor_outputs.append(target_factor_probs)
            new_states += model_states
        scores = self._interpolation(outputs)

        target_factors = None  # type: Optional[np.ndarray]
        if factor_outputs:
            # target factors are greedily 'decoded'.
            factor_predictions = [npx.cast(np.expand_dims(np.argmin(self._interpolation(fs), axis=-1), axis=1), dtype='int32')
                                  for fs in zip(*factor_outputs)]
            if factor_predictions:
                target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
                    else np.concatenate(factor_predictions, axis=1)
        return scores, new_states, target_factors
Example #13
    def _training_cell_state_transform(
            previous_cell_state, weighted_inputs,
            forget_rates) -> Tuple[np.ndarray, np.ndarray]:
        """Update SSRU cell at training time"""
        def _time_step_update(
                step_input_and_forget_rate,
                previous_step_state) -> Tuple[np.ndarray, np.ndarray]:
            """
            Recurrently update the SSRU cell state for one time step.

            :param step_input_and_forget_rate: List = [step_input, forget_rate]
            :param previous_step_state: cell state at (t-1)
            :return: twice the current time step state. NOTE: The first instance will be stacked in the final
            foreach output and the second will be the input to the next time_step_update iteration.
            """
            step_input, forget_rate = step_input_and_forget_rate  # each of shape (batch_size, model_size)
            current_step_state = forget_rate * previous_step_state + step_input
            return current_step_state, current_step_state

        # (max_length, batch, input_depth), (batch, input_depth)
        cell_state, last_step_state = npx.foreach(
            _time_step_update, [weighted_inputs, forget_rates],
            np.squeeze(previous_cell_state, axis=0))

        return cell_state, np.expand_dims(last_step_state, axis=0)
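A plain-Python sketch (hypothetical values) of the recurrence the npx.foreach scan above implements, c_t = forget_rate_t * c_{t-1} + step_input_t:

forget_rates = [0.5, 0.5]
weighted_inputs = [1.0, 1.0]
cell = 0.0
for f, x in zip(forget_rates, weighted_inputs):
    cell = f * cell + x   # c_1 = 1.0, c_2 = 1.5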
Example #14
    def get_answerable_logits(self, contextual_embedding, p_mask):
        """Get the answerable logits.

        Parameters
        ----------
        contextual_embedding
            Shape (batch_size, sequence_length, C)
        p_mask
            Shape (batch_size, sequence_length)
            Mask the sequence.
            0 --> Denote that the element is masked,
            1 --> Denote that the element is not masked

        Returns
        -------
        answerable_logits
            Shape (batch_size, 2)
        """
        # Shape (batch_size, sequence_length)
        start_scores = np.squeeze(self.start_scores(contextual_embedding), -1)
        start_score_weights = masked_softmax(start_scores, p_mask, axis=-1)
        start_agg_feature = npx.batch_dot(np.expand_dims(start_score_weights, axis=1),
                                          contextual_embedding)
        start_agg_feature = np.squeeze(start_agg_feature, 1)
        cls_feature = contextual_embedding[:, 0, :]
        answerable_scores = self.answerable_scores(np.concatenate([start_agg_feature,
                                                                  cls_feature], axis=-1))
        answerable_logits = npx.log_softmax(answerable_scores, axis=-1)
        return answerable_logits
Example #15
def multibox_target(anchors, labels):
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.ctx, anchors.shape[0]
    print(labels.shape)
    print(batch_size)
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = match_anchor_to_bbox(label[:, 1:], anchors,
                                                device)  # [-1, 0, 1, -1 , 1]

        bbox_mask = np.tile((np.expand_dims((anchors_bbox_map >= 0), axis=-1)),
                            (1, 4)).astype('int32')
        # Initialize class_labels and assigned bbox coordinates with zeros
        class_labels = np.zeros(num_anchors, dtype=np.int32, ctx=device)
        assigned_bb = np.zeros((num_anchors, 4), dtype=np.float32, ctx=device)
        # Assign class labels to the anchor boxes using matched gt bbox labels
        # If no gt bbox is assigned to an anchor box, then let the
        # class_labels and assigned_bb remain zero, i.e the background class
        indices_true = np.nonzero(anchors_bbox_map >= 0)[0]  #[1,2,4]
        print(indices_true)
        bb_idx = anchors_bbox_map[indices_true]  #[0, 1, 1]
        class_labels[indices_true] = label[bb_idx, 0].astype(
            'int32') + 1  # Get category
        assigned_bb[indices_true] = label[bb_idx, 1:]  # Get ground-truth
        # offset transformations
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = np.stack(batch_offset)
    bbox_mask = np.stack(batch_mask)
    class_labels = np.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)
Example #16
File: t5.py Project: RickyLLu/gluon-nlp
    def forward(self, hidden_states, valid_length, mem_states,
                mem_valid_length):
        # 1. relative position embeddings and attention masks
        position_embeddings = self.relative_position_encoder(
            self._get_relative_position(hidden_states))
        # relative position embedding is not used for cross attention,
        # so we just obtain the correct shape and fill it with 0
        mem_relative_position = np.zeros_like(
            self._get_relative_position(hidden_states, mem_states))
        mem_position_embeddings = np.repeat(np.expand_dims(
            mem_relative_position, axis=0),
                                            self._num_heads,
                                            axis=0)
        self_attn_mask = gen_self_attn_mask(hidden_states,
                                            valid_length,
                                            dtype=self._dtype,
                                            attn_type='causal',
                                            layout=self.layout)
        mem_attn_mask = gen_mem_attn_mask(mem_states,
                                          mem_valid_length,
                                          hidden_states,
                                          valid_length,
                                          dtype=self._dtype,
                                          layout=self.layout)

        # 2. decoder blocks and other layers
        hidden_states = self.dropout(hidden_states)
        for layer in self.layers:
            hidden_states = layer(hidden_states, self_attn_mask,
                                  position_embeddings, mem_states,
                                  mem_attn_mask, mem_position_embeddings)
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
Example #17
    def forward(self, data, steps):  # pylint: disable=arguments-differ
        """
        Applies positional embeddings to input data.

        :param data: Input data. Shape: (batch, length or 1, num_embed)
        :param steps: Optional steps input. If given, shape is (batch_size or 1, seq_len,)

        :return: Data with positional embeddings added
        """
        # (length, num_embed)
        if steps is None:
            # (batch, length, num_embed)
            pos_embedding = npx.slice_like(np.expand_dims(self.weight.data(),
                                                          axis=0),
                                           data,
                                           axes=(1, ))
        else:
            # (batch_size or 1, seq_len, num_embed)
            pos_embedding = npx.embedding(steps, self.weight.data(),
                                          self.max_seq_len, self.num_embed)

        if self.weight_type == 'fixed':
            pos_embedding = npx.stop_gradient(pos_embedding)

        if self.scale_up_input:
            data = data * (self.num_embed**0.5)

        return data + pos_embedding
Example #18
    def forward(self, user_id, seq, item_id):
        item_embs = np.expand_dims(self.Q(seq), 1)
        user_emb = self.P(user_id)  # (4096, 10)
        out, out_h, out_v, out_hs = None, None, None, []
        # Vertical convolution
        if self.d_prime:
            out_v = self.conv_v(item_embs)
            out_v = out_v.reshape(
                out_v.shape[0], self.fc1_dim_v)  # (4096, 4*10)
        # Horizontal convolution over the time dimension
        if self.d:
            for conv, maxp in zip(self.conv_h, self.max_pool):  # slide each filter over the sequence
                conv_out = np.squeeze(npx.relu(conv(item_embs)), axis=3)
                t = maxp(conv_out)
                pool_out = np.squeeze(t, axis=2)
                out_hs.append(pool_out)
            out_h = np.concatenate(out_hs, axis=1)  # (4096, 16*3)
        out = np.concatenate([out_v, out_h], axis=1)  # (4096, 4*10+16*3)
        z = self.fc(self.dropout(out))  # (4096, 10)

        # Concatenate with user_emb
        x = np.concatenate([z, user_emb], axis=1)  # (4096, 20)

        # Compute the score against item_emb
        q_prime_i = np.squeeze(self.Q_prime(item_id))  # (4096, 20)
        b = np.squeeze(self.b(item_id))
        res = (x * q_prime_i).sum(1) + b  # (4096,)
        return res
Example #19
    def forward(self, tokens, token_types, valid_length, p_mask, start_position):
        """

        Parameters
        ----------
        tokens
            Shape (batch_size, sequence_length)
        token_types
            Shape (batch_size, sequence_length)
        valid_length
            Shape (batch_size,)
        p_mask
            Shape (batch_size, sequence_length)
        start_position
            Shape (batch_size,)

        Returns
        -------
        start_logits
            Shape (batch_size, sequence_length)
        end_logits
            Shape (batch_size, sequence_length)
        answerable_logits
        """
        if self.use_segmentation:
            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
        else:
            contextual_embeddings = self.backbone(tokens, valid_length)
        start_logits = self.get_start_logits(contextual_embeddings, p_mask)
        end_logits = self.get_end_logits(contextual_embeddings,
                                         np.expand_dims(start_position, axis=1),
                                         p_mask)
        end_logits = np.squeeze(end_logits, axis=1)
        answerable_logits = self.get_answerable_logits(contextual_embeddings, p_mask)
        return start_logits, end_logits, answerable_logits
Example #20
def gen_rel_position(data, past_data=None, dtype=np.int32, layout='NT'): 
    """Create a matrix of relative position for RelAttentionScoreCell. 
    
    The relative position is defined as the index difference: `mem_i` - `query_j`. 
    Note, though, that the implementation here makes sense in self-attention's setting, 
    but not in cross-attention's. Hence, both `mem_i` and `query_j` are time indices from 
    `data` (or, in incremental decoding's case, the concatenated sequence from the current 
    stepwise `data` and the previous steps `past_data`). 

    Parameters
    ----------
    data
        The data. Under incremental decoding, seq_length = 1. 

        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)
    past_data
        This is only used under incremental decoding. Stacked data from previous steps. 
    dtype
        Data type of the mask
    layout
        Layout of the data + past_data

    Returns
    -------
    relative_position :
        Shape (query_length, mem_length) where query_length = mem_length = seq_length
    """
    time_axis = 1 if layout == 'NT' else 0
    if past_data is None: 
        position = npx.arange_like(data, axis=time_axis)
    else: 
        # for incremental decoding only, where past data is of the shape: 
        # NT(NTK): (B, L_seq, num_heads, n_kv) -> (B, L_seq, inner_dim)
        # TN(TNK): (L_seq, B, num_heads, n_kv) -> (L_seq, B, inner_dim)
        past_data = npx.reshape(past_data, (-2, -2, -5))
        position = npx.arange_like(
            np.concatenate([past_data, data], axis=time_axis), 
            axis=time_axis
        )
    query_position = np.expand_dims(position, axis=-1)
    mem_position = np.expand_dims(position, axis=0)
    relative_position = mem_position - query_position
    return relative_position.astype(np.int32) # shape (L_seq, L_seq)
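As a worked example (hypothetical sizes, layout='NT', seq_length=3), the returned matrix is the memory index minus the query index:

from mxnet import np

data = np.zeros((1, 3, 8))       # (batch_size, seq_length, C)
print(gen_rel_position(data))
# [[ 0  1  2]
#  [-1  0  1]
#  [-2 -1  0]]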
Example #21
 def forward(self, queries, keys, values, valid_lens):
     queries, keys = self.W_q(queries), self.W_k(keys)
     # After dimension expansion, shape of `queries`: (`batch_size`, no. of
     # queries, 1, `num_hiddens`) and shape of `keys`: (`batch_size`, 1,
     # no. of key-value pairs, `num_hiddens`). Sum them up with
     # broadcasting
     features = np.expand_dims(queries, axis=2) + np.expand_dims(
         keys, axis=1)
     features = np.tanh(features)
     # There is only one output of `self.w_v`, so we remove the last
     # one-dimensional entry from the shape. Shape of `scores`:
     # (`batch_size`, no. of queries, no. of key-value pairs)
     scores = np.squeeze(self.w_v(features), axis=-1)
     self.attention_weights = masked_softmax(scores, valid_lens)
     # Shape of `values`: (`batch_size`, no. of key-value pairs, value
     # dimension)
     return npx.batch_dot(self.dropout(self.attention_weights), values)
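A quick shape sketch (hypothetical sizes) of the broadcasting used for the additive-attention features above: a size-1 axis is inserted on each operand so queries and keys combine into one (batch, queries, key-value pairs, hiddens) tensor.

from mxnet import np

batch_size, num_queries, num_kv, num_hiddens = 2, 1, 10, 8
queries = np.zeros((batch_size, num_queries, num_hiddens))
keys = np.zeros((batch_size, num_kv, num_hiddens))
features = np.expand_dims(queries, axis=2) + np.expand_dims(keys, axis=1)
print(features.shape)   # (2, 1, 10, 8)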
Example #22
 def forward(self, X, state):
     enc_outputs, hidden_state, enc_valid_len = state
     X = self.embedding(X).swapaxes(0, 1)
     outputs = []
     for x in X:
         # query shape: (batch_size, 1, num_hiddens)
         query = np.expand_dims(hidden_state[0][-1], axis=1)
         # context has same shape as query
         context = self.attention_cell(query, enc_outputs, enc_outputs,
                                       enc_valid_len)
         # Concatenate on the feature dimension
         x = np.concatenate((context, np.expand_dims(x, axis=1)), axis=-1)
         # Reshape x to (1, batch_size, embed_size + num_hiddens)
         out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
         outputs.append(out)
     outputs = self.dense(np.concatenate(outputs, axis=0))
     return outputs.swapaxes(0,
                             1), [enc_outputs, hidden_state, enc_valid_len]
Example #23
    def forward(self, x, layer_states):
        """

        Parameters
        ----------
        x
            - layout = 'NT'
                Shape (batch_size, seq_length, C_in)
            - layout = 'TN'
                Shape (seq_length, batch_size, C_in)

        layer_states
            - layout = 'NT'
                Shape (2, batch_size, prev_len, C_in)
            - layout = 'TN'
                Shape (2, prev_len, batch_size, C_in)
        """
        x = self.ln(x)
        if self._layout == 'NT':
            batch_axis, time_axis = 0, 1
            prev_len = npx.shape_array(layer_states)[2]
        else:
            batch_axis, time_axis = 1, 0
            prev_len = npx.shape_array(layer_states)[1]

        query, key, value = np.split(self.qkv(x), 3, axis=-1)
        if layer_states is not None:
            prev_key, prev_value = layer_states[0], layer_states[1]
            key = np.concatenate([prev_key, key], axis=time_axis)
            value = np.concatenate([prev_value, value], axis=time_axis)
        new_states = np.stack([key, value], axis=0)

        # gen mask
        query_pos = npx.arange_like(query, axis=time_axis)
        if prev_len is not None:
            query_pos = query_pos + prev_len
        key_pos = npx.arange_like(key, axis=time_axis)
        # (query_len, key_len)
        mask = (npx.reshape(key_pos,
                            (1, -1)) <= npx.reshape(query_pos,
                                                    (-1, 1))).astype(
                                                        self._dtype)
        # broadcast to (batch_size, query_len, key_len)
        mask = npx.broadcast_like(np.expand_dims(mask, axis=0),
                                  query,
                                  lhs_axes=0,
                                  rhs_axes=batch_axis)

        query = npx.reshape(query, (-2, -2, self._num_heads, -1))
        key = npx.reshape(key, (-2, -2, self._num_heads, -1))
        value = npx.reshape(value, (-2, -2, self._num_heads, -1))

        out, [_, attn_weight] = self.attention_cell(query, key, value, mask)
        out = self.out_proj(out)
        out = self.hidden_dropout(out)

        return out, new_states
Example #24
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps, ctx):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    enc_valid_len = np.array([len(src_tokens)], ctx=ctx)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = np.array(src_tokens, ctx=ctx)
    # Add the batch_size dimension
    enc_outputs = model.encoder(np.expand_dims(enc_X, axis=0), enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=ctx), axis=0)
    predict_tokens = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next timestep input
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').item()
        if py == tgt_vocab['<eos>']:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
Example #25
File: t5.py Project: RickyLLu/gluon-nlp
 def _get_relative_position(self,
                            hidden_states,
                            mem_states=None,
                            past_key_value=None):
     if past_key_value is None:
         query_position = np.expand_dims(npx.arange_like(
             hidden_states, axis=self.time_axis),
                                         axis=-1)
     else:
         # for incremental decoding only, where past key and past value are of shape
         # NT(NTK): (B, L_seq, num_heads, n_kv); TN(TNK): (L_seq, B, num_heads, n_kv)
         query_position = npx.arange_like(np.concatenate(
             [hidden_states, past_key_value[0]], axis=self.time_axis),
                                          axis=self.time_axis)
         query_position = np.expand_dims(query_position, axis=-1)
     mem_position = np.expand_dims(npx.arange_like(
         hidden_states if mem_states is None else mem_states,
         axis=self.time_axis),
                                   axis=0)
     relative_position = mem_position - query_position
     return relative_position.astype(np.int32)
Example #26
def multibox_detection(cls_probs,
                       offset_preds,
                       anchors,
                       nms_threshold=0.5,
                       pos_threshold=0.00999999978):
    device, batch_size = cls_probs.ctx, cls_probs.shape[0]
    anchors = np.squeeze(anchors, axis=0)
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    # print(offset_preds)
    for i in range(batch_size):
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        conf, class_id = np.max(cls_prob[1:], 0), np.argmax(cls_prob[1:], 0)
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)
        print(keep)
        # Find all non_keep indices and set the class_id to background
        all_idx = np.arange(num_anchors, dtype=np.int32, ctx=device)
        combined = np.concatenate((keep, all_idx))
        unique, counts = np.unique(combined, return_counts=True)
        print(unique, " . ", counts)
        non_keep = unique[counts == 1]
        all_id_sorted = np.concatenate((keep, non_keep))
        class_id[non_keep] = -1
        print(class_id)
        class_id = class_id[all_id_sorted].astype('float32')
        print(class_id)
        conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted]
        print(conf)
        print(predicted_bb)
        # threshold to be a positive prediction
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = -1
        conf[below_min_idx] = 1 - conf[below_min_idx]
        pred_info = np.concatenate((np.expand_dims(
            class_id, axis=1), np.expand_dims(conf, axis=1), predicted_bb),
                                   axis=1)
        out.append(pred_info)
    return np.stack(out)
Example #27
File: t5.py Project: yongyi-wu/gluon-nlp
 def forward(self, step_data, past_states):
     mem_states, mem_valid_length, position, past_key_values = past_states
     step_hidden_states = self.model.input_embedding_layer(step_data)
     # NT: (B, d_model) -> (B, 1, d_model); TN: (B, d_model) -> (1, B, d_model)
     step_hidden_states = np.expand_dims(step_hidden_states,
                                         axis=self.model._time_axis)
     step_hidden_states, present_key_values = self.model.decoder.incremental_decode(
         step_hidden_states, position, past_key_values, mem_states,
         mem_valid_length)
     step_hidden_states = self.output_layer(step_hidden_states)
     # NT: (B, 1, vocab_size) -> (B, vocab_size); TN: (1, B, vocab_size) -> (B, vocab_size)
     step_hidden_states = npx.reshape(step_hidden_states, (-5, -1))
     return step_hidden_states, (mem_states, mem_valid_length, position + 1,
                                 present_key_values)
Example #28
    def decode_step(self,
                    step_input: np.ndarray,
                    states: List,
                    vocab_slice_ids: Optional[np.ndarray] = None):
        logits, states, target_factor_outputs = self._model.decode_step(step_input, states, vocab_slice_ids)
        if not self._skip_softmax:
            logits = npx.log_softmax(logits, axis=-1, temperature=self._softmax_temperature)
        scores = -logits

        target_factors = None  # type: Optional[np.ndarray]
        if target_factor_outputs:
            # target factors are greedily 'decoded'.
            factor_predictions = [npx.cast(np.expand_dims(np.argmax(tfo, axis=1), axis=1), dtype='int32') for tfo in target_factor_outputs]
            target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
                else np.concatenate(factor_predictions, axis=1)
        return scores, states, target_factors
Example #29
def test_create_target_and_shifted_label_sequences():
    pytest.importorskip('mxnet')
    from sockeye import data_io
    from mxnet import np
    target_and_label = np.array([[C.BOS_ID, 4, 17, 35, 12, C.EOS_ID, C.PAD_ID, C.PAD_ID],
                                 [C.BOS_ID, 15, 23, 23, 77, 55, 22, C.EOS_ID],
                                 [C.BOS_ID, 4, C.EOS_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID, C.PAD_ID]])
    target_and_label = np.expand_dims(target_and_label, axis=2)
    expected_lengths = np.array([5, 7, 2])

    target, label = data_io.create_target_and_shifted_label_sequences(target_and_label)

    assert target.shape[0] == label.shape[0] == target_and_label.shape[0]
    assert target.shape[1] == label.shape[1] == target_and_label.shape[1] - 1
    lengths = (target != C.PAD_ID).sum(axis=1).squeeze()
    assert np.allclose(lengths, expected_lengths)
Example #30
    def get_initial_embedding(self, inputs, token_types=None):
        """Get the initial token embeddings that considers the token type and positional embeddings

        Parameters
        ----------
        inputs
            - layout = 'NT'
                Shape (batch_size, seq_length)
            - layout = 'TN'
                Shape (seq_length, batch_size)

        token_types
            The type of tokens. If None, it will be initialized as all zero.

            - layout = 'NT'
                Shape (batch_size, seq_length)
            - layout = 'TN'
                Shape (seq_length, batch_size)

        Returns
        -------
        embedding
            The initial embedding that will be fed into the encoder

            - layout = 'NT'
                Shape (batch_size, seq_length, C_embed)
            - layout = 'TN'
                Shape (seq_length, batch_size, C_embed)

        """
        if self.layout == 'NT':
            time_axis, batch_axis = 1, 0
        else:
            time_axis, batch_axis = 0, 1
        embedding = self.word_embed(inputs)
        if token_types is None:
            token_types = np.zeros_like(inputs)
        type_embedding = self.token_type_embed(token_types)
        embedding = embedding + type_embedding
        if self.pos_embed_type is not None:
            positional_embedding = self.token_pos_embed(npx.arange_like(inputs, axis=time_axis))
            positional_embedding = np.expand_dims(positional_embedding, axis=batch_axis)
            embedding = embedding + positional_embedding
        # Extra layer normalization plus dropout
        embedding = self.embed_layer_norm(embedding)
        embedding = self.embed_dropout(embedding)
        return embedding