Example No. 1
  def batch_size(self):
    """Return batch size.

    Returns:
      batch size of the contained expression(s)
    """
    if self.expr_list: return tt.batch_size(self.expr_list[0])
    elif self.expr_tensor is not None: return tt.batch_size(self.expr_tensor)
    else: return tt.batch_size(self.expr_transposed_tensor)
Example No. 2
    def add_input_to_prev(self, prev_state: UniLSTMState, x: Union[tt.Tensor, Sequence[tt.Tensor]]) \
            -> Tuple[Sequence[tt.Tensor], Sequence[tt.Tensor]]:
        if isinstance(x, dy.Expression):
            x = [x]
        elif type(x) != list:
            x = list(x)

        if self.dropout_rate > 0.0 and self.train and self.dropout_mask_x is None:
            self.set_dropout_masks(batch_size=tt.batch_size(x[0]))

        new_c, new_h = [], []
        for layer_i in range(self.num_layers):
            if self.dropout_rate > 0.0 and self.train:
                # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                gates = dy.vanilla_lstm_gates_dropout_concat(
                    x, prev_state._h[layer_i], self.Wx[layer_i],
                    self.Wh[layer_i], self.b[layer_i],
                    self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            else:
                gates = dy.vanilla_lstm_gates_concat(
                    x, prev_state._h[layer_i], self.Wx[layer_i],
                    self.Wh[layer_i], self.b[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            new_c.append(dy.vanilla_lstm_c(prev_state._c[layer_i], gates))
            new_h.append(dy.vanilla_lstm_h(new_c[-1], gates))
            x = [new_h[-1]]

        return new_c, new_h
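A note on the dropout branch: it follows the variational ("tied weights") scheme of https://arxiv.org/abs/1512.05287, where one dropout mask per layer is sampled once and then reused at every timestep instead of being resampled per step. A minimal PyTorch sketch of that idea (sample_mask and the shapes below are illustrative, not xnmt API):

import torch

def sample_mask(batch_size, dim, p):
    # One mask, scaled by 1/keep_prob so expectations are unchanged,
    # reused at every timestep ("tied" / variational dropout).
    keep = 1.0 - p
    return torch.bernoulli(torch.full((batch_size, dim), keep)) / keep

batch_size, dim, p = 4, 8, 0.3
mask_x = sample_mask(batch_size, dim, p)
mask_h = sample_mask(batch_size, dim, p)
cell = torch.nn.LSTMCell(dim, dim)
h = c = torch.zeros(batch_size, dim)
for x_t in torch.randn(5, batch_size, dim):    # 5 timesteps, same masks each step
    h, c = cell(x_t * mask_x, (h * mask_h, c))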
Example No. 3
    def calc_nll(self, src: Union[batchers.Batch, sent.Sentence], trg: Union[batchers.Batch, sent.Sentence]) \
            -> tt.Tensor:
        if not batchers.is_batched(src):
            src = batchers.ListBatch([src])

        src_inputs = batchers.ListBatch(
            [s[:-1] for s in src],
            mask=batchers.Mask(src.mask.np_arr[:, :-1]) if src.mask else None)
        src_targets = batchers.ListBatch(
            [s[1:] for s in src],
            mask=batchers.Mask(src.mask.np_arr[:, 1:]) if src.mask else None)

        event_trigger.start_sent(src)
        embeddings = self.src_embedder.embed_sent(src_inputs)
        encodings = self.rnn.transduce(embeddings)
        encodings_tensor = encodings.as_tensor()

        encoding_reshaped = tt.merge_time_batch_dims(encodings_tensor)
        seq_len = tt.sent_len(encodings_tensor)
        batch_size = tt.batch_size(encodings_tensor)

        outputs = self.transform.transform(encoding_reshaped)

        ref_action = np.asarray([s.words for s in src_targets]).reshape(
            (seq_len * batch_size, ))
        loss_expr_perstep = self.scorer.calc_loss(
            outputs, batchers.mark_as_batch(ref_action))

        loss_expr_perstep = tt.unmerge_time_batch_dims(loss_expr_perstep,
                                                       batch_size)

        loss = tt.aggregate_masked_loss(loss_expr_perstep, src_targets.mask)

        return loss
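The reshaping in calc_nll (merge the time and batch dimensions, score every position in a single call, un-merge, then mask) avoids a Python loop over timesteps. A rough PyTorch sketch of the same pattern, with made-up shapes:

import torch
import torch.nn.functional as F

batch, seq_len, hidden, vocab = 3, 7, 16, 50
encodings = torch.randn(batch, seq_len, hidden)      # encoder states
targets = torch.randint(0, vocab, (batch, seq_len))  # next-word references
proj = torch.nn.Linear(hidden, vocab)

# Merge time and batch dims and score all positions at once.
logits = proj(encodings.reshape(batch * seq_len, hidden))
loss_per_step = F.cross_entropy(logits, targets.reshape(-1), reduction='none')

# Un-merge, then zero out padded positions before summing per sentence.
loss_per_step = loss_per_step.reshape(batch, seq_len)
token_mask = torch.ones(batch, seq_len)              # 1 = real token, 0 = padding
loss = (loss_per_step * token_mask).sum(dim=1)       # one loss per sentence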
Example No. 4
 def _combine_batches(self, batched_expr, comb_method: str = "sum"):
     if comb_method == "sum":
         return dy.sum_batches(batched_expr)
     elif comb_method == "avg":
         return dy.sum_batches(batched_expr) * (1.0 /
                                                tt.batch_size(batched_expr))
     else:
         raise ValueError(
             f"Unknown batch combination method '{comb_method}', expected 'sum' or 'avg'.'"
         )
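Both branches reduce a per-sentence loss over the batch; "avg" simply rescales the sum by the batch size. The plain-tensor equivalent is roughly:

import torch

per_sentence_loss = torch.tensor([0.7, 1.2, 0.5])
summed = per_sentence_loss.sum()                                # comb_method == "sum"
averaged = per_sentence_loss.sum() / per_sentence_loss.size(0)  # comb_method == "avg"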
Example No. 5
 def _encode_src(self, src: Union[sent.Sentence, batchers.Batch]) -> tuple:
     event_trigger.start_sent(src)
     embeddings = self.src_embedder.embed_sent(src)
     encodings = self.encoder.transduce(embeddings)
     encodings_tensor = encodings.as_tensor()
     encoding_reshaped = tt.merge_time_batch_dims(encodings_tensor)
     outputs = self.transform.transform(encoding_reshaped)
     return (tt.batch_size(encodings_tensor), encodings, outputs,
             tt.sent_len(encodings_tensor))
Example No. 6
    def transduce(
        self, expr_seq: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
        if isinstance(expr_seq, expression_seqs.ExpressionSequence):
            expr_seq = [expr_seq]
        concat_inputs = len(expr_seq) >= 2
        batch_size = tt.batch_size(expr_seq[0][0])
        seq_len = expr_seq[0].sent_len()
        mask = expr_seq[0].mask

        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        cur_input = expr_seq
        self._final_states = []
        for layer_i in range(self.num_layers):
            h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            for pos_i in range(seq_len):
                if concat_inputs and layer_i == 0:
                    x_t = tt.concatenate(
                        [cur_input[i][pos_i] for i in range(len(cur_input))])
                else:
                    x_t = cur_input[0][pos_i]
                h_tm1 = h[-1]
                if self.dropout_rate > 0.0 and self.train:
                    # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                    x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
                    h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
                h_t, c_t = self.layers[layer_i](x_t, (h_tm1, c[-1]))
                if mask is None or np.isclose(
                        np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
                    c.append(c_t)
                    h.append(h_t)
                else:
                    c.append(
                        mask.cmult_by_timestep_expr(c_t, pos_i, True) +
                        mask.cmult_by_timestep_expr(c[-1], pos_i, False))
                    h.append(
                        mask.cmult_by_timestep_expr(h_t, pos_i, True) +
                        mask.cmult_by_timestep_expr(h[-1], pos_i, False))
            self._final_states.append(
                transducers.FinalTransducerState(h[-1], c[-1]))
            cur_input = [h[1:]]

        return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
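The mask arithmetic in the inner loop implements copy-through for padded positions: where the mask marks padding, the previous h/c is kept; elsewhere the freshly computed value is used. A standalone sketch of that blend, assuming the convention that a mask value of 1 marks a padded position:

import torch

def masked_step(new_val, prev_val, mask_t):
    # mask_t: (batch, 1), 1.0 at padded positions, 0.0 at real tokens
    return (1.0 - mask_t) * new_val + mask_t * prev_val

batch, hidden = 4, 8
h_prev = torch.zeros(batch, hidden)
h_new = torch.randn(batch, hidden)
mask_t = torch.tensor([[0.], [0.], [1.], [1.]])  # last two sentences already ended
h = masked_step(h_new, h_prev, mask_t)           # padded rows keep their old state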
Example No. 7
    def transduce(
        self, es: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':

        batch_size = tt.batch_size(es.as_tensor())
        if es.mask:
            seq_lengths = es.mask.seq_lengths()
        else:
            seq_lengths = [es.sent_len()] * batch_size

        # Sort the input and lengths as the descending order
        seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
        lengths, perm_index = seq_lengths.sort(0, descending=True)
        sorted_input = es.as_tensor()[perm_index]

        perm_index_rev = [-1] * len(lengths)
        for i in range(len(lengths)):
            perm_index_rev[perm_index[i]] = i
        perm_index_rev = torch.LongTensor(perm_index_rev).to(xnmt.device)

        packed_input = nn.utils.rnn.pack_padded_sequence(sorted_input,
                                                         list(lengths.data),
                                                         batch_first=True)
        state_size = self.num_dir * self.num_layers, batch_size, self.hidden_dim // self.num_dir
        h0 = sorted_input.new_zeros(*state_size)
        c0 = sorted_input.new_zeros(*state_size)
        output, (final_hiddens,
                 final_cells) = self.lstm(packed_input, (h0, c0))
        output = nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, total_length=es.sent_len())[0]

        # restore the sorting
        decoded = output[perm_index_rev]

        self._final_states = []
        for layer_i in range(self.num_layers):
            final_hidden = final_hiddens.view(
                self.num_layers, self.num_dir, batch_size,
                -1)[layer_i].transpose(0, 1).contiguous().view(batch_size, -1)
            final_hidden = final_hidden[perm_index_rev]
            self._final_states.append(
                transducers.FinalTransducerState(final_hidden))

        ret = expression_seqs.ExpressionSequence(expr_tensor=decoded,
                                                 mask=es.mask)
        return ret
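The sort/unsort bookkeeping above exists because pack_padded_sequence traditionally required lengths in descending order; more recent PyTorch versions can handle the reordering internally via enforce_sorted=False. A minimal sketch of the packed bidirectional LSTM call under that assumption:

import torch
import torch.nn as nn

batch, max_len, in_dim, hid = 3, 5, 8, 16
x = torch.randn(batch, max_len, in_dim)
lengths = torch.tensor([5, 3, 2])

lstm = nn.LSTM(in_dim, hid, batch_first=True, bidirectional=True)
packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True,
                                           enforce_sorted=False)
packed_out, (h_n, c_n) = lstm(packed)
out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True,
                                          total_length=max_len)
# out: (batch, max_len, 2 * hid); positions beyond each length are zero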
Example No. 8
    def transduce(
        self, seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:

        if self.train and self.dropout > 0.0:
            seq_tensor = tt.dropout(
                self.child.transduce(seq).as_tensor(),
                self.dropout) + seq.as_tensor()
        else:
            seq_tensor = self.child.transduce(
                seq).as_tensor() + seq.as_tensor()
        if self.layer_norm:
            batch_size = tt.batch_size(seq_tensor)
            merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
            transformed_seq_tensor = self.layer_norm_component.transform(
                merged_seq_tensor)
            seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor,
                                                    batch_size)
        return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
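This wrapper adds a residual connection (child transducer output plus its input, with dropout on the child output during training) and optionally applies layer normalization position-wise via the merge/un-merge trick. In PyTorch the same step could be sketched as follows; nn.LayerNorm already normalizes over the last dimension, so no explicit merge is needed there:

import torch
import torch.nn as nn

batch, seq_len, dim = 2, 6, 32
x = torch.randn(batch, seq_len, dim)
child = nn.Linear(dim, dim)        # stand-in for the wrapped transducer
norm = nn.LayerNorm(dim)
drop = nn.Dropout(p=0.1)

residual = drop(child(x)) + x      # residual connection
out = norm(residual)               # position-wise layer norm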
Example No. 9
    def add_input_to_prev(self, prev_state: UniLSTMState, x: tt.Tensor) \
            -> Tuple[Sequence[tt.Tensor], Sequence[tt.Tensor]]:
        assert isinstance(x, tt.Tensor)

        if self.dropout_rate > 0.0 and self.train and self.dropout_mask_x is None:
            self.set_dropout_masks(batch_size=tt.batch_size(x))

        new_c, new_h = [], []
        for layer_i in range(self.num_layers):
            h_tm1 = prev_state._h[layer_i]
            if self.dropout_rate > 0.0 and self.train:
                # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                x = torch.mul(x, self.dropout_mask_x[layer_i])
                h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
            h_t, c_t = self.layers[layer_i](x, (h_tm1, prev_state._c[layer_i]))
            new_c.append(c_t)
            new_h.append(h_t)
            x = h_t

        return new_c, new_h
Example No. 10
    def initial_state(self, enc_final_states: Any,
                      ss: Any) -> AutoRegressiveDecoderState:
        """Get the initial state of the decoder given the encoder final states.

    Args:
      enc_final_states: The encoder final states. Usually but not necessarily an :class:`xnmt.expression_sequence.ExpressionSequence`
      ss: first input
    Returns:
      initial decoder state
    """
        rnn_state = self.rnn.initial_state()
        rnn_s = self.bridge.decoder_init(enc_final_states)
        rnn_state = rnn_state.set_s(rnn_s)
        ss_expr = self.embedder.embed(ss)
        zeros = tt.zeroes(
            hidden_dim=self.input_dim,
            batch_size=tt.batch_size(ss_expr)) if self.input_feeding else None
        rnn_state = rnn_state.add_input(
            tt.concatenate([ss_expr, zeros]) if self.input_feeding else ss_expr
        )
        return AutoRegressiveDecoderState(rnn_state=rnn_state, context=zeros)
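With input feeding, the decoder RNN consumes the word embedding concatenated with the previous attention context at every step; at the very first step there is no context yet, so a zero vector of size input_dim is used, as in the code above. In plain tensor terms:

import torch

batch, emb_dim, ctx_dim = 4, 16, 32
ss_expr = torch.randn(batch, emb_dim)              # embedding of the start symbol
zeros = torch.zeros(batch, ctx_dim)                # placeholder context for step 0
first_input = torch.cat([ss_expr, zeros], dim=-1)  # what the decoder RNN sees first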
Example No. 11
 def transduce(
     self, src: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     sent_len = src.sent_len()
     batch_size = tt.batch_size(src[0])
     embeddings = self.embeddings(
         torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
     # embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
     if self.op == 'sum':
         output = embeddings + src.as_tensor()
     elif self.op == 'concat':
         output = tt.concatenate([embeddings, src.as_tensor()])
     else:
         raise ValueError(
              f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")'
         )
     if self.train and self.dropout > 0.0:
         output = tt.dropout(output, self.dropout)
     output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                     mask=src.mask)
     self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
     return output_seq
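The transducer above looks up a learned embedding for each position 0..sent_len-1 and either adds it to the input ("sum") or concatenates it ("concat"). A compact PyTorch sketch of the "sum" variant:

import torch
import torch.nn as nn

batch, seq_len, dim, max_len = 2, 7, 32, 512
src = torch.randn(batch, seq_len, dim)
pos_emb = nn.Embedding(max_len, dim)

positions = torch.arange(seq_len).unsqueeze(0).expand(batch, seq_len)
output = src + pos_emb(positions)   # op == 'sum'; 'concat' would use torch.cat instead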
Example No. 12
 def decoder_init(
     self, enc_final_states: Sequence[transducers.FinalTransducerState]
 ) -> List[tt.Tensor]:
     batch_size = tt.batch_size(enc_final_states[0].main_expr())
     z = tt.zeroes(hidden_dim=self.dec_dim, batch_size=batch_size)
     return [z] * (self.dec_layers * 2)
Example No. 13
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence using multi-head self-attention

    Args:
      expr_seq: expression sequence
    Returns:
      expression sequence
    """

        Wq, Wk, Wv, Wo = [
            dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)
        ]
        bq, bk, bv, bo = [
            dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)
        ]

        # Start with a [(length, model_size) x batch] tensor
        x = expr_seq.as_transposed_tensor()
        x_len = tt.sent_len_transp(x)
        x_batch = tt.batch_size(x)
        # Get the query key and value vectors
        # TODO: do we need bias broadcasting in DyNet?
        # q = dy.affine_transform([bq, x, Wq])
        # k = dy.affine_transform([bk, x, Wk])
        # v = dy.affine_transform([bv, x, Wv])
        q = bq + x * Wq
        k = bk + x * Wk
        v = bv + x * Wv

        # Split to batches [(length, head_dim) x batch * num_heads] tensor
        q, k, v = [
            dy.reshape(x, (x_len, self.head_dim),
                       batch_size=x_batch * self.num_heads) for x in (q, k, v)
        ]

        # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
        attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = dy.inputTensor(np.repeat(
                expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(),
                                  batched=True) * -1e10
            attn_score = attn_score + mask
        attn_prob = dy.softmax(attn_score, d=1)
        if self.train and self.dropout > 0.0:
            attn_prob = dy.dropout(attn_prob, self.dropout)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = dy.reshape(attn_prob * v, (x_len, self.input_dim),
                       batch_size=x_batch)
        # Final transformation
        # o = dy.affine_transform([bo, attn_prob * v, Wo])
        o = bo + o * Wo

        expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o,
                                                      mask=expr_seq.mask)

        self._final_states = [
            transducers.FinalTransducerState(expr_seq[-1], None)
        ]

        return expr_seq
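The DyNet code above folds the head dimension into the batch dimension so all heads attend in parallel, scales the dot products by sqrt(head_dim), and pushes padded key positions towards minus infinity before the softmax. A rough PyTorch equivalent of the core computation (shapes and names are illustrative, not the xnmt API):

import torch
import torch.nn.functional as F
from math import sqrt

batch, seq_len, model_dim, num_heads = 2, 5, 64, 8
head_dim = model_dim // num_heads
x = torch.randn(batch, seq_len, model_dim)
Wq, Wk, Wv = (torch.nn.Linear(model_dim, model_dim) for _ in range(3))

def split_heads(t):
    # (batch, len, model_dim) -> (batch * heads, len, head_dim)
    return t.reshape(batch, seq_len, num_heads, head_dim) \
            .permute(0, 2, 1, 3).reshape(batch * num_heads, seq_len, head_dim)

q, k, v = split_heads(Wq(x)), split_heads(Wk(x)), split_heads(Wv(x))
scores = q @ k.transpose(1, 2) / sqrt(head_dim)          # (batch*heads, len, len)
pad_mask = torch.zeros(batch, 1, seq_len)                # 1.0 at padded key positions
pad_mask = pad_mask.repeat_interleave(num_heads, dim=0)  # repeat per head
scores = scores + pad_mask * -1e10
attn = F.softmax(scores, dim=-1)
out = (attn @ v).reshape(batch, num_heads, seq_len, head_dim) \
                .permute(0, 2, 1, 3).reshape(batch, seq_len, model_dim)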
Example No. 14
    def transduce(
        self, embed_sent: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        src = embed_sent.as_tensor()

        sent_len = tt.sent_len(src)
        batch_size = tt.batch_size(src)
        pad_size = (self.window_receptor - 1) // 2  # TODO: adapt this for even window sizes

        src = dy.concatenate([
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size), src,
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
        ],
                             d=1)
        padded_sent_len = sent_len + 2 * pad_size

        conv1 = dy.parameter(self.pConv1)
        bias1 = dy.parameter(self.pBias1)
        src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                             batch_size=batch_size)
        cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

        hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)

        for conv_hid, bias_hid in self.builder_layers:
            hidden_layer = dy.conv2d_bias(hidden_layer,
                                          dy.parameter(conv_hid),
                                          dy.parameter(bias_hid),
                                          stride=[1, 1])
            hidden_layer = dy.reshape(hidden_layer,
                                      (self.internal_dim, sent_len, 1),
                                      batch_size=batch_size)
            if self.non_linearity == 'linear':
                hidden_layer = hidden_layer
            elif self.non_linearity == 'tanh':
                hidden_layer = dy.tanh(hidden_layer)
            elif self.non_linearity == 'relu':
                hidden_layer = dy.rectify(hidden_layer)
            elif self.non_linearity == 'sigmoid':
                hidden_layer = dy.logistic(hidden_layer)
        last_conv = dy.parameter(self.last_conv)
        last_bias = dy.parameter(self.last_bias)
        output = dy.conv2d_bias(hidden_layer,
                                last_conv,
                                last_bias,
                                stride=[1, 1])
        output = dy.reshape(output, (sent_len, self.output_dim),
                            batch_size=batch_size)
        output_seq = expression_seqs.ExpressionSequence(expr_tensor=output)
        self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
        return output_seq
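The convolutional layers above slide over the time axis with symmetric zero padding so the output length matches the input (only odd window sizes are handled, as the TODO notes), followed by a configurable non-linearity. A PyTorch sketch with Conv1d, which performs the same padding internally:

import torch
import torch.nn as nn

batch, seq_len, in_dim, hid_dim, window = 2, 9, 16, 32, 3
src = torch.randn(batch, in_dim, seq_len)  # Conv1d expects (batch, channels, length)

conv = nn.Conv1d(in_dim, hid_dim, kernel_size=window, padding=(window - 1) // 2)
hidden = torch.tanh(conv(src))             # 'tanh' case; output stays (batch, hid_dim, seq_len)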