Exemplo n.º 1
0
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):

    input_shape = L.shape(input_mask)
    input_batch = input_shape[0]
    input_seqlen = input_shape[1]
    num_slot = input_seqlen / slot_seqlen
    num_b = num_slot - 1
    ones = L.ones([num_b], dtype="float32")  # [num_b]
    diag_ones = L.diag(ones)  # [num_b, num_b]
    diag_ones = L.unsqueeze(diag_ones, [1, -1])  # [num_b, 1, num_b, 1]
    diag_ones = L.expand(
        diag_ones,
        [1, slot_seqlen, 1, slot_seqlen])  # [num_b, seqlen, num_b, seqlen]
    diag_ones = L.reshape(diag_ones,
                          [1, num_b * slot_seqlen, num_b * slot_seqlen
                           ])  # [1, num_b*seqlen, num_b*seqlen]

    graph_attn_bias = L.concat([
        L.ones([1, num_b * slot_seqlen, slot_seqlen], dtype="float32"),
        diag_ones
    ], 2)
    graph_attn_bias = L.concat([
        L.ones([1, slot_seqlen, num_slot * slot_seqlen], dtype="float32"),
        graph_attn_bias
    ], 1)  # [1, seq, seq]

    pad_attn_bias = L.matmul(input_mask, input_mask,
                             transpose_y=True)  # [batch, seq, seq]
    attn_bias = graph_attn_bias * pad_attn_bias

    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
Exemplo n.º 2
0
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100,
                            tgt_type_id=3):
    model.eval()
    _, logits, info = model(q_ids, q_sids)
    gen_ids = L.argmax(logits, -1)
    d_batch, d_seqlen = q_ids.shape
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    has_stopped = np.zeros([d_batch], dtype=np.bool)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(np.bool)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
Exemplo n.º 3
0
    def simple_net(self):
        d0 = layers.data(
            "d0", shape=[10], append_batch_size=False, dtype='float32')
        d1 = layers.data(
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
        # fill_constant npu op doesn't support int64
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)
        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)
        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
        array_len = layers.cast(array_len, 'int64')
        array_len.stop_gradient = True
        cond = layers.ones(shape=[1], dtype='int32')
        cond = layers.cast(cond, 'bool')
        j = layers.fill_constant(shape=[1], dtype='int32', value=1)
        j = layers.cast(j, 'int64')
        j.stop_gradient = True
        array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
        array_len2 = layers.cast(array_len2, 'int64')
        array_len2.stop_gradient = True
        cond2 = layers.logical_or(x=j, y=array_len2)
        cond2 = layers.ones(shape=[1], dtype='int32')
        cond2 = layers.cast(cond2, 'bool')
        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)
        sum_result = layers.array_read(array=mem_array, i=j)
        loss = layers.mean(sum_result)
        return loss, sum_result
Exemplo n.º 4
0
    def forward(self, q, k, v, lengths, speaker_embed, start_index, 
                force_monotonic=False, prev_coeffs=None, window=None):
        # add position encoding as an inductive bias 
        if self.has_bias: # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
                self.k_pos_affine(speaker_embed), axes=[-1]))
        else: # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
            forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
Exemplo n.º 5
0
    def forward(self, x, mask_in=None):
        assert len(x.shape) == 4

        if mask_in is not None or self.last_size != tuple(x.shape):
            self.last_size = tuple(x.shape)

            with dg.no_grad():
                if self.weight_maskUpdater.dtype != x.dtype:
                    self.weight_maskUpdater = self.weight_maskUpdater.astype(
                        x.dtype)

                if mask_in is None:
                    # If mask is not provided, create a mask.
                    if self.multi_channel:
                        mask = L.ones(x.shape, dtype=x.dtype)
                    else:
                        mask = L.ones((1, 1, x.shape[2], x.shape[3]),
                                      dtype=x.dtype)
                else:
                    mask = mask_in

                self.update_mask = nn.functional.conv2d(
                    mask,
                    self.weight_maskUpdater,
                    bias=None,
                    stride=self.stride,
                    padding=self.padding,
                    dilation=self.dilation,
                    groups=1)
                # For mixed precision training, eps from 1e-8 ~ 1e-6
                eps = 1e-6
                self.mask_ratio = self.slide_winsize / (self.update_mask + eps)
                self.update_mask = L.clamp(self.update_mask, 0, 1)
                self.mask_ratio = self.mask_ratio * self.update_mask

        raw_out = super(PartialConv2D,
                        self).forward(x * mask if mask_in is not None else x)

        if self.bias is not None:
            bias_view = L.reshape(self.bias, (1, self.out_channels, 1, 1))
            output = (raw_out - bias_view) * self.mask_ratio + bias_view
            output = output * self.update_mask
        else:
            output = raw_out * self.mask_ratio

        if self.return_mask:
            return output, self.update_mask
        else:
            return output
Exemplo n.º 6
0
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only consiter beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  #[gather new beam state according to new beam id]
    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())
    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Exemplo n.º 7
0
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape  # as batch size=1 in this hub module. the first dim means bsz * beam_size equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # make [UNK] prob = 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only consiter beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  #[gather new beam state according to new beam id]

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Exemplo n.º 8
0
    def __init__(self, *args, multi_channel=False, return_mask=True, **kwargs):
        # whether the mask is multi-channel or not
        self.multi_channel = multi_channel
        self.return_mask = return_mask
        super(PartialConv2D, self).__init__(*args, **kwargs)

        if self.multi_channel:
            self.weight_maskUpdater = L.ones(
                (self.out_channels, self.in_channels, self.kernel_size[0],
                 self.kernel_size[1]))
        else:
            self.weight_maskUpdater = L.ones(
                (1, 1, self.kernel_size[0], self.kernel_size[1]))

        shape = self.weight_maskUpdater.shape
        self.slide_winsize = shape[1] * shape[2] * shape[3]
        self.last_size = (None, None, None, None)
        self.update_mask = None
        self.mask_ratio = None
        self.partial_conv = True
Exemplo n.º 9
0
    def _build_sentence_ids(self, src_ids):
        src_shape = L.shape(src_ids)
        src_seqlen = src_shape[1]
        src_batch = src_shape[0]
        slot_seqlen = self.slot_seqlen

        zeros = L.zeros([src_batch, slot_seqlen], "int64")
        ones = L.ones([src_batch, src_seqlen - slot_seqlen], "int64")
        sentence_ids = L.concat([zeros, ones], 1)
        sentence_ids.stop_gradient = True
        return sentence_ids
Exemplo n.º 10
0
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    r"""计算量子态的偏迹。

    Args:
        rho_AB (ComplexVariable): 输入的量子态
        dim1 (int): 系统A的维数
        dim2 (int): 系统B的维数
        A_or_B (int): 1或者2,1表示去除A,2表示去除B

    Returns:
        ComplexVariable: 量子态的偏迹

    """
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex128")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex128")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float64")
        row_mid = ones([1, 1], dtype="float64")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float64")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float64")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(ComplexVariable(row_tmp.real, -row_tmp.imag),
                                 perm=[1, 0]),
                ),
            )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(ComplexVariable(row_tmp.real, -row_tmp.imag),
                                 perm=[1, 0]),
                ),
            )

    return res
Exemplo n.º 11
0
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
    n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
        'num_attention_heads']
    head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
    head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32')
    head_mask.stop_gradient = False

    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if 'ffn.i' in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if 'ffn.o' in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    eval_task_names = ('mnli',
                       'mnli-mm') if args.task == 'mnli' else (args.task, )

    for eval_task in eval_task_names:
        for batch in dev_ds.start(place):
            ids, sids, label = batch
            out = model(ids,
                        sids,
                        labels=label,
                        head_mask=head_mask,
                        num_layers=model_cfg['num_hidden_layers'])
            loss = out[0]
            loss.backward()
            head_importance += L.abs(FD.to_variable(head_mask.gradient()))

            for w1, b1, w2, current_importance in zip(intermediate_weight,
                                                      intermediate_bias,
                                                      output_weight,
                                                      neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) +
                     b1.numpy() * b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
Exemplo n.º 12
0
def gen_bias(encoder_inputs, decoder_inputs, step):
    decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
    attn_bias = L.reshape(L.range(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
    decoder_bias = L.cast((L.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.),
                          'float32')  #[1, 1, decoderlen, decoderlen]
    encoder_bias = L.unsqueeze(L.cast(L.ones_like(encoder_inputs), 'float32'), [1])  #[bsz, 1, encoderlen]
    encoder_bias = L.expand(encoder_bias, [1, decoder_seqlen, 1])  #[bsz,decoderlen, encoderlen]
    decoder_bias = L.expand(decoder_bias, [decoder_bsz, 1, 1])  #[bsz, decoderlen, decoderlen]
    if step > 0:
        bias = L.concat([encoder_bias, L.ones([decoder_bsz, decoder_seqlen, step], 'float32'), decoder_bias], -1)
    else:
        bias = L.concat([encoder_bias, decoder_bias], -1)
    return bias
Exemplo n.º 13
0
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    r"""求AB复合系统下的偏迹

    Args:
        rho_AB (Variable): AB复合系统的密度矩阵
        dim1 (int): A系统的维度
        dim2 (int): B系统的维度
        A_orB (int): 1表示求系统A,2表示求系统B

    Returns:
        ComplexVariable: 求得的偏迹
    """

    # dim_total = dim1 * dim2
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex64")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex64")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float32")
        row_mid = ones([1, 1], dtype="float32")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float32")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float32")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(ComplexVariable(row_tmp.real, -row_tmp.imag),
                                 perm=[1, 0]),
                ),
            )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res += matmul(
                matmul(row_tmp, rho_AB),
                pp_transpose(ComplexVariable(row_tmp.real, -row_tmp.imag),
                             perm=[1, 0]),
            )
    return res
Exemplo n.º 14
0
def get_dec_attn_key_pad_mask(seq_k, num_head, dtype):
    ''' For masking out the padding part of key sequence. '''

    # Expand to fit the shape of key query attention matrix.
    padding_mask = layers.cast(seq_k == 0, dtype=dtype)
    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
    len_k = seq_k.shape[1]
    triu = layers.triu(layers.ones(shape=[len_k, len_k], dtype=dtype),
                       diagonal=1)
    padding_mask = padding_mask + triu
    padding_mask = layers.cast(padding_mask != 0,
                               dtype=dtype) * -1e30  #* (-2**32 + 1)
    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
    return padding_mask
Exemplo n.º 15
0
def gradient_penalty(x, y, f):
    # interpolation
    shape = [x.shape[0]] + [1] * (x.dim() - 1)
    alpha = layers.rand(shape)
    z = x + alpha * (y - x)

    # gradient penalty
    z = dygraph.to_variable(z)
    o = f(z)
    g = dygraph.grad(o,
                     z,
                     grad_outputs=layers.ones(o.size()),
                     create_graph=True)[0].view(z.size(0), -1)
    gp = ((g.norm(p=2, dim=1) - 1)**2).mean()
    return gp
Exemplo n.º 16
0
    def __init__(self, x, y, y_aux, cfg):
        self.program = fluid.default_main_program().clone()
        with fluid.program_guard(self.program):
            model = ACGAN(cfg.latent_size, cfg.num_classes)
            self.fake, self.aux = model.network_d(x, name='d')

            self.fake_loss = layers.sigmoid_cross_entropy_with_logits(
                x=self.fake, label=y)
            self.aux_loss = layers.softmax_with_cross_entropy(logits=self.aux,
                                                              label=y_aux)
            self.unweighted_loss = layers.reduce_sum(self.fake_loss +
                                                     self.aux_loss)
            self.infer_program = self.program.clone(for_test=True)

            # we don't want the discriminator to also maximize the classification
            # accuracy of the auxiliary classifier on generated images, so we
            # don't train discriminator to produce class labels for generated
            # images (see https://openreview.net/forum?id=rJXTf9Bxg).
            # To preserve sum of sample weights for the auxiliary classifier,
            # we assign sample weight of 2 to the real images.

            fake_loss_weight = layers.ones(shape=[cfg.batch_size * 2, 1],
                                           dtype='float32')
            aux_loss_weight_zeros = layers.zeros(shape=[cfg.batch_size, 1],
                                                 dtype='float32')
            aux_loss_weight_twos = layers.fill_constant(
                shape=[cfg.batch_size, 1], value=2.0, dtype='float32')
            aux_loss_weight = layers.concat(
                [aux_loss_weight_twos, aux_loss_weight_zeros])

            self.fake_loss = layers.elementwise_mul(self.fake_loss,
                                                    fake_loss_weight)
            self.aux_loss = layers.elementwise_mul(self.aux_loss,
                                                   aux_loss_weight)

            self.loss = layers.reduce_sum(self.fake_loss) + layers.reduce_sum(
                self.aux_loss)

            vars = []
            for var in self.program.list_vars():
                if fluid.io.is_parameter(var) and (var.name.startswith("d")):
                    vars.append(var.name)
            optimizer = fluid.optimizer.Adam(learning_rate=cfg.adam_lr,
                                             beta1=cfg.adam_beta_1,
                                             name="net_d")
            optimizer.minimize(self.loss, parameter_list=vars)
Exemplo n.º 17
0
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    """
    :param rho_AB: the input density matrix
    :param dim1: dimension for system A
    :param dim2: dimension for system B
    :param A_or_B: 1 or 2, choose the system that you want trace out.
    :return: partial trace
    """

    # dim_total = dim1 * dim2
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex64")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex64")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float32")
        row_mid = ones([1, 1], dtype="float32")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float32")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float32")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(
                        ComplexVariable(row_tmp.real, -row_tmp.imag),
                        perm=[1, 0]), ), )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res += matmul(
                matmul(row_tmp, rho_AB),
                pp_transpose(
                    ComplexVariable(row_tmp.real, -row_tmp.imag), perm=[1, 0]),
            )
    return res
Exemplo n.º 18
0
 def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
     """
     Create the criterion.
     Parameters:
         num_classes: number of object categories, omitting the special on-object category
         matcher: module able to compute a matching between targets and proposals
         weight_dict: dict containing as key the names of the losses and as values their relative weight.
         eos_coef: relative classification weight applied to the no-object category
         losses: list of all the losses to be applied. See get_loss for list of available losses.
     """
     super().__init__()
     self.num_classes = num_classes
     self.matcher = matcher
     self.weight_dict = weight_dict
     self.eos_coef = eos_coef
     self.losses = losses
     self.eos_coef = eos_coef
     empty_weight = L.ones([self.num_classes + 1], dtype="float32")
     empty_weight[-1] = self.eos_coef
     self.empty_weight = empty_weight
Exemplo n.º 19
0
def topk_pool(gw, score, graph_id, ratio):
    """Implementation of topk pooling, where k means pooling ratio.
    
    Args:
        gw: Graph wrapper object.

        score: The attention score of all nodes, which is used to select 
               important nodes.

        graph_id: The graphs that the nodes belong to.

        ratio: The pooling ratio of nodes we want to select.

    Return: 
        perm: The index of nodes we choose.

        ratio_length: The selected node numbers of each graph.
    """

    graph_lod = gw.graph_lod
    graph_nodes = gw.num_nodes
    num_graph = gw.num_graph

    num_nodes = L.ones(shape=[graph_nodes], dtype="float32")
    num_nodes = L.lod_reset(num_nodes, graph_lod)
    num_nodes_per_graph = L.sequence_pool(num_nodes, pool_type='sum')
    max_num_nodes = L.reduce_max(num_nodes_per_graph, dim=0)
    max_num_nodes = L.cast(max_num_nodes, dtype="int32")

    index = L.arange(0, gw.num_nodes, dtype="int64")
    offset = L.gather(graph_lod, graph_id, overwrite=False)
    index = (index - offset) + (graph_id * max_num_nodes)
    index.stop_gradient = True

    # padding
    dense_score = L.fill_constant(shape=[num_graph * max_num_nodes],
                                  dtype="float32",
                                  value=-999999)
    index = L.reshape(index, shape=[-1])
    dense_score = L.scatter(dense_score, index, updates=score)
    num_graph = L.cast(num_graph, dtype="int32")
    dense_score = L.reshape(dense_score, shape=[num_graph, max_num_nodes])

    # record the sorted index
    _, sort_index = L.argsort(dense_score, axis=-1, descending=True)

    # recover the index range
    graph_lod = graph_lod[:-1]
    graph_lod = L.reshape(graph_lod, shape=[-1, 1])
    graph_lod = L.cast(graph_lod, dtype="int64")
    sort_index = L.elementwise_add(sort_index, graph_lod, axis=-1)
    sort_index = L.reshape(sort_index, shape=[-1, 1])

    # use sequence_slice to choose selected node index
    pad_lod = L.arange(0, (num_graph + 1) * max_num_nodes,
                       step=max_num_nodes,
                       dtype="int32")
    sort_index = L.lod_reset(sort_index, pad_lod)
    ratio_length = L.ceil(num_nodes_per_graph * ratio)
    ratio_length = L.cast(ratio_length, dtype="int64")
    ratio_length = L.reshape(ratio_length, shape=[-1, 1])
    offset = L.zeros(shape=[num_graph, 1], dtype="int64")
    choose_index = L.sequence_slice(input=sort_index,
                                    offset=offset,
                                    length=ratio_length)

    perm = L.reshape(choose_index, shape=[-1])
    return perm, ratio_length
Exemplo n.º 20
0
    def __call__(self, kernel_preds, cls_preds, mask_protos,
                 batch_gt_objs_tensors, batch_gt_clss_tensors,
                 batch_gt_masks_tensors, batch_gt_pos_idx_tensors):
        '''
        :param kernel_preds:  kernel_preds里每个元素形状是[N, 256, seg_num_grid, seg_num_grid],  每个格子的预测卷积核。      从 小感受野 到 大感受野。
        :param cls_preds:     cls_preds里每个元素形状是   [N,  80, seg_num_grid, seg_num_grid],  每个格子的预测概率,未进行sigmoid()激活。  从 小感受野 到 大感受野。
        :param mask_protos:   [bs, 256, s4, s4]   掩码原型
        :param batch_gt_objs_tensors:   里每个元素形状是[N, seg_num_grid, seg_num_grid, 1],   每个格子的objness。           从 小感受野 到 大感受野。
        :param batch_gt_clss_tensors:   里每个元素形状是[N, seg_num_grid, seg_num_grid, 80],  每个格子真实类别onehot。      从 小感受野 到 大感受野。
        :param batch_gt_masks_tensors:     里每个元素形状是[N, -1, s4, s4],   真实掩码。  从 小感受野 到 大感受野。
        :param batch_gt_pos_idx_tensors:   里每个元素形状是[N, -1, 3],    正样本的下标。  从 小感受野 到 大感受野。
        :return:
        '''

        batch_size = self.batch_size
        num_layers = len(kernel_preds)

        # ================= 计算损失 =================
        num_ins = 0.  # 记录这一批图片的正样本个数
        loss_clss, loss_masks = [], []
        for bid in range(batch_size):
            for lid in range(num_layers):
                # ================ 掩码损失 ======================
                mask_proto = mask_protos[bid]  # [256, s4, s4]   这张图片产生的掩码原型。
                kernel_pred = kernel_preds[lid][
                    bid]  # [256, seg_num_grid, seg_num_grid]   格子预测的卷积核(yolact中的“掩码系数”)
                kernel_pred = L.transpose(
                    kernel_pred, perm=[1, 2, 0]
                )  # [seg_num_grid, seg_num_grid, 256]   格子预测的卷积核(yolact中的“掩码系数”)

                gt_objs = batch_gt_objs_tensors[lid][
                    bid]  # [seg_num_grid, seg_num_grid, 1]
                gt_masks = batch_gt_masks_tensors[lid][bid]  # [-1, s4, s4]
                pmidx = batch_gt_pos_idx_tensors[lid][bid]  # [-1, 3]
                gt_objs.stop_gradient = True
                gt_masks.stop_gradient = True
                pmidx.stop_gradient = True

                idx_sum = L.reduce_sum(pmidx, dim=1)
                keep = L.where(idx_sum > -1)
                keep = L.reshape(keep, (-1, ))
                keep.stop_gradient = True
                pmidx = L.gather(pmidx, keep)  # [M, 3]

                yx_idx = pmidx[:, :2]  # [M, 2]
                m_idx = pmidx[:, 2]  # [M, ]
                yx_idx.stop_gradient = True
                m_idx.stop_gradient = True

                # 抽出来
                gt_obj = L.gather_nd(gt_objs,
                                     yx_idx)  # [M, 1]        是否是真正的正样本。
                pos_krn = L.gather_nd(kernel_pred,
                                      yx_idx)  # [M, 256]      正样本的卷积核(掩码系数)。
                gt_mask = L.gather(gt_masks, m_idx)  # [M, s4, s4]   真实掩码。

                # 正样本数量
                num_ins += L.reduce_sum(gt_obj)

                # 生成预测掩码
                mask_proto = L.transpose(mask_proto, perm=[1, 2,
                                                           0])  # [s4, s4, 256]
                masks = L.matmul(mask_proto, pos_krn,
                                 transpose_y=True)  # [s4, s4, M]
                masks = L.sigmoid(masks)  # [s4, s4, M]
                masks = L.transpose(masks, perm=[2, 0, 1])  # [M, s4, s4]
                loss_mask = self.dice_loss(masks, gt_mask, gt_obj)
                loss_masks.append(loss_mask)

                # ================ 分类损失。sigmoid_focal_loss() ======================
                gamma = self.loss_gamma
                alpha = self.loss_alpha
                pred_conf = cls_preds[lid][
                    bid]  # [80, seg_num_grid, seg_num_grid]    未进行sigmoid()激活。
                pred_conf = L.transpose(pred_conf, perm=[
                    1, 2, 0
                ])  # [seg_num_grid, seg_num_grid, 80]    未进行sigmoid()激活。
                pred_conf = L.sigmoid(
                    pred_conf
                )  # [seg_num_grid, seg_num_grid, 80]    已进行sigmoid()激活。
                gt_clss = batch_gt_clss_tensors[lid][
                    bid]  # [seg_num_grid, seg_num_grid, 80]    真实类别onehot
                gt_clss.stop_gradient = True
                pos_loss = gt_clss * (0 - L.log(pred_conf + 1e-9)) * L.pow(
                    1 - pred_conf, gamma) * alpha
                neg_loss = (
                    1.0 - gt_clss) * (0 - L.log(1 - pred_conf + 1e-9)) * L.pow(
                        pred_conf, gamma) * (1 - alpha)
                focal_loss = pos_loss + neg_loss
                focal_loss = L.reduce_sum(focal_loss, dim=[0, 1])
                loss_clss.append(focal_loss)
        loss_masks = L.concat(loss_masks, axis=0)
        loss_masks = L.reduce_sum(loss_masks) * self.ins_loss_weight
        loss_masks = loss_masks / L.elementwise_max(
            L.ones((1, ), dtype='float32'), num_ins)

        loss_clss = L.concat(loss_clss, axis=0)
        loss_clss = L.reduce_sum(loss_clss) * self.clss_loss_weight
        loss_clss = loss_clss / L.elementwise_max(
            L.ones((1, ), dtype='float32'), num_ins)

        loss_all = {"loss_masks": loss_masks, "loss_clss": loss_clss}
        return loss_all
Exemplo n.º 21
0
def beam_search_infilling(model,
                          q_ids,
                          q_sids,
                          sos_id,
                          eos_id,
                          attn_id,
                          max_encode_len=640,
                          max_decode_len=100,
                          beam_width=5,
                          tgt_type_id=3,
                          length_penalty=1.0):
    model.eval()
    _, __, info = model(q_ids, q_sids)
    d_batch, d_seqlen = q_ids.shape

    state = BeamSearchState(log_probs=L.zeros([d_batch, beam_width],
                                              'float32'),
                            lengths=L.zeros([d_batch, beam_width], 'int64'),
                            finished=L.zeros([d_batch, beam_width], 'int64'))
    outputs = []

    def reorder_(t, parent_id):
        """reorder cache according to parent beam id"""
        gather_idx = L.where(parent_id != -1)[:, 0] * beam_width + L.reshape(
            parent_id, [-1])
        t = L.gather(t, gather_idx)
        return t

    def tile_(t, times):
        _shapes = list(t.shape[1:])
        ret = L.reshape(
            L.expand(L.unsqueeze(t, [1]), [
                1,
                times,
            ] + [
                1,
            ] * len(_shapes)), [
                -1,
            ] + _shapes)
        return ret

    cached_k, cached_v = info['caches']
    cached_k = [tile_(k, beam_width) for k in cached_k]
    cached_v = [tile_(v, beam_width) for v in cached_v]
    past_cache = (cached_k, cached_v)

    q_ids = tile_(q_ids, beam_width)
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)

    cls_ids = L.ones([d_batch * beam_width], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch * beam_width], dtype='int64') * attn_id  # SOS
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch * beam_width, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)

        output, state = beam_search_step(state,
                                         logits[:, 1],
                                         eos_id=eos_id,
                                         beam_width=beam_width,
                                         is_first_step=(step == 0),
                                         length_penalty=length_penalty)
        outputs.append(output)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            reorder_(L.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            reorder_(L.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        pred_ids_flatten = L.reshape(output.predicted_ids,
                                     [d_batch * beam_width])
        ids = L.stack([pred_ids_flatten, attn_ids], 1)

        if state.finished.numpy().all():
            break

    final_ids = L.stack([o.predicted_ids for o in outputs], 0)
    final_parent_ids = L.stack([o.beam_parent_ids for o in outputs], 0)
    final_ids = L.gather_tree(final_ids, final_parent_ids)[:, :,
                                                           0]  # pick best beam
    final_ids = L.transpose(L.reshape(final_ids, [-1, d_batch * 1]), [1, 0])
    return final_ids
    def _init_state(self, inputs):
        """ Initialize decode state. """
        state = {}

        src_token = inputs["src_token"]
        src_mask = inputs["src_mask"]
        src_pos = inputs["src_pos"]
        src_type = inputs["src_type"]
        src_turn = inputs["src_turn"]

        batch_size = src_token.shape[0]
        seq_len = src_token.shape[1]

        src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
        src_embed = self.embed_layer_norm(src_embed)

        mask = self._create_mask(src_mask, append_head=self.num_latent > 0)

        if self.num_latent > 0:
            src_embed = F.unsqueeze(src_embed, [1])
            src_embed = layers.expand(src_embed, [1, self.num_latent, 1, 1])
            src_embed = layers.reshape(src_embed, [-1, seq_len, self.hidden_dim])

            latent_embed = self.latent_embeddings
            latent_embed = F.unsqueeze(latent_embed, [1])
            latent_embed = layers.expand(latent_embed, [batch_size, 1, 1])
            latent_embed = self.embed_layer_norm(latent_embed)

            enc_out = layers.concat([latent_embed, src_embed], axis=1)

            mask = F.unsqueeze(mask, [1])
            mask = layers.expand(mask, [1, self.num_latent, 1, 1])
            mask = layers.reshape(mask, [-1, seq_len + 1, seq_len + 1])
        else:
            enc_out = src_embed

        cache = {}
        for l, layer in enumerate(self.layers):
            cache[f"layer_{l}"] = {}
            enc_out = layer(enc_out, mask, cache[f"layer_{l}"])

        state["cache"] = cache
        state["mask"] = mask[:, :1]
        if self.num_latent > 0:
            state["batch_size"] = batch_size * self.num_latent
            shape = [batch_size * self.num_latent, 1, 1]
        else:
            state["batch_size"] = batch_size
            shape = [batch_size, 1, 1]
        state["pred_mask"] = layers.ones(shape, self._dtype)
        state["pred_pos"] = layers.zeros(shape, "int64")
        state["pred_type"] = layers.zeros(shape, "int64")
        state["pred_turn"] = layers.zeros(shape, "int64")

        if "tgt_token" in inputs and self.num_latent > 0:
            tgt_token = inputs["tgt_token"][:, :-1]
            tgt_mask = inputs["tgt_mask"][:, :-1]
            tgt_pos = inputs["tgt_pos"][:, :-1]
            tgt_type = inputs["tgt_type"][:, :-1]
            tgt_turn = inputs["tgt_turn"][:, :-1]

            input_mask = layers.concat([src_mask, tgt_mask], axis=1)
            input_mask.stop_gradient = True
            src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
            tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
            embed = layers.concat([src_embed, tgt_embed], axis=1)
            embed = self.embed_layer_norm(embed)

            batch_size = src_token.shape[0]
            src_len = src_token.shape[1]
            tgt_len = tgt_token.shape[1]

            post_embed, post_probs, post_logits = self._posteriori_network(
                input_mask, embed, batch_size, src_len, tgt_len)
            state["post_probs"] = post_probs

        return state