Example No. 1
    def forward_grad(self, x):
        # forward finite difference along x: pad one column on the right,
        # then convolve with the gradient filter
        x1 = fluid.layers.pad2d(x, paddings=[0, 0, 0, 1])
        grad_x = self.conv2df_grad(x1)

        # zero the last column (grad_x[:, :, :, -1] = 0); fluid tensors have
        # no in-place slice assignment, so unstack, replace, and restack
        temp = unstack(grad_x, axis=3)
        temp[-1] = temp[-1] * 0
        grad_x = stack(temp, axis=3)

        # forward finite difference along y: pad one row at the bottom
        x2 = fluid.layers.pad2d(x, paddings=[0, 1, 0, 0])
        grad_y = self.conv2df_grad2(x2)

        # zero the last row (grad_y[:, :, -1, :] = 0)
        temp = unstack(grad_y, axis=2)
        temp[-1] = temp[-1] * 0
        grad_y = stack(temp, axis=2)

        bt, c, h, w = grad_x.shape
        grad_x = fluid.layers.reshape(grad_x, [-1, c, h, w])
        grad_y = fluid.layers.reshape(grad_y, [-1, c, h, w])
        return grad_x, grad_y
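A minimal sketch of the unstack/stack trick used above, in isolation (assuming Paddle 1.x dygraph mode; fluid tensors do not support in-place slice assignment, which is why grad_x[:, :, :, -1] = 0 is emulated by unstack/replace/restack):

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from paddle.fluid.layers import stack, unstack

with dg.guard(fluid.CPUPlace()):
    t = dg.to_variable(np.ones((2, 3, 4, 5), dtype="float32"))
    cols = unstack(t, axis=3)        # list of 5 tensors, each [2, 3, 4]
    cols[-1] = cols[-1] * 0          # zero the last column, out of place
    t = stack(cols, axis=3)          # back to [2, 3, 4, 5]
    print(t.numpy()[..., -1].sum())  # 0.0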
Example No. 2
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100,
                            tgt_type_id=3):
    model.eval()
    _, logits, info = model(q_ids, q_sids)
    gen_ids = L.argmax(logits, -1)
    d_batch, d_seqlen = q_ids.shape
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    has_stopped = np.zeros([d_batch], dtype=bool)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(bool)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
Example No. 3
    def seq2seq_api_rnn(input_embedding,
                        len=3,
                        init_hiddens=None,
                        init_cells=None):
        class EncoderCell(layers.RNNCell):
            def __init__(self,
                         num_layers,
                         hidden_size,
                         dropout_prob=0.,
                         forget_bias=0.):
                self.num_layers = num_layers
                self.hidden_size = hidden_size
                self.dropout_prob = dropout_prob
                self.lstm_cells = []
                for i in range(num_layers):
                    self.lstm_cells.append(
                        layers.LSTMCell(
                            hidden_size,
                            forget_bias=forget_bias,
                            param_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.
                                UniformInitializer(low=-init_scale,
                                                   high=init_scale))))

            def call(self, step_input, states):
                new_states = []
                for i in range(self.num_layers):
                    out, new_state = self.lstm_cells[i](step_input, states[i])
                    step_input = layers.dropout(
                        out,
                        self.dropout_prob,
                        dropout_implementation='upscale_in_train'
                    ) if self.dropout_prob > 0 else out
                    new_states.append(new_state)
                return step_input, new_states

        cell = EncoderCell(num_layers, hidden_size, dropout)
        output, new_states = layers.rnn(
            cell,
            inputs=input_embedding,
            initial_states=[[hidden, cell] for hidden, cell in zip([
                layers.reshape(init_hidden, shape=[-1, hidden_size])
                for init_hidden in layers.split(
                    init_hiddens, num_or_sections=num_layers, dim=0)
            ], [
                layers.reshape(init_cell, shape=[-1, hidden_size])
                for init_cell in layers.split(
                    init_cells, num_or_sections=num_layers, dim=0)
            ])],
            time_major=False)
        last_hidden = layers.stack([hidden for hidden, _ in new_states], 0)
        last_cell = layers.stack([cell for _, cell in new_states], 0)
        return output, last_hidden, last_cell
Example No. 4
    def forward_grad(self, x):
        grad_x = self.conv4u(layers.pad(x, (0, 0, 0, 0, 0, 0, 0, 1)))
        tmp = layers.unstack(grad_x, axis=2)
        tmp[-1] = tmp[-1] - tmp[-1]  #tmp[-1]=0

        grad_x = layers.stack(tmp, axis=2)

        grad_y = self.conv4v(layers.pad(x, (0, 0, 0, 0, 0, 1, 0, 0)))

        tmp = layers.unstack(grad_y, axis=2)
        tmp[-1] = tmp[-1] - tmp[-1]  # tmp[-1]=0
        grad_y = layers.stack(tmp, axis=2)
        return grad_x, grad_y
Example No. 5
        def loop_body(i,
                      mel_input,
                      outputs,
                      hiddens,
                      attentions,
                      state=None,
                      coeffs=None):
            # state and coeffs are None for the first step
            decoded, hidden, new_coeffs, new_state = self.decoder(
                mel_input, keys, values, text_lengths, i, speaker_embed, state,
                force_monotonic_attention, coeffs, window)
            new_coeffs = F.stack(new_coeffs)  # (N, B, T_dec=1, T_enc)

            attentions.append(new_coeffs)  # (N, B, T_dec=1, T_enc)
            outputs.append(decoded)  # (B, T_dec=1, rC_mel)
            hiddens.append(hidden)  # (B, T_dec=1, C_dec)

            # slice the last frame out of r generated frames to be used as the input for the next step
            batch_size = mel_input.shape[0]
            frames = F.reshape(decoded, [
                batch_size, -1, self.decoder.reduction_factor,
                self.decoder.in_channels
            ])
            input_frame = frames[:, :, -1, :]
            return (i + 1, input_frame, outputs, hiddens, attentions,
                    new_state, new_coeffs)
Example No. 6
def masks_to_boxes(masks):
    """
    Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number
    of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format
    """
    if np.sum(masks.shape) == 0:
        return dg.to_variable(np.zeros((0, 4)))

    h, w = masks.shape[-2:]
    y = dg.to_variable(np.arange(0, h, 1, dtype="float32"))
    x = dg.to_variable(np.arange(0, w, 1, dtype="float32"))
    y, x = T.meshgrid([y, x])  # [h, w]

    x_mask = (masks * L.unsqueeze(x, [0]))  # [N, H, W]
    x_max = L.reduce_max(L.flatten(x_mask, axis=1), dim=-1)
    non_mask = dg.to_variable(~masks.numpy())
    x_mask[non_mask] = 1e8
    x_min = L.reduce_min(L.flatten(x_mask, axis=1), dim=-1)

    y_mask = (masks * L.unsqueeze(y, [0]))  # [N, H, W]
    y_max = L.reduce_max(L.flatten(y_mask, axis=1), dim=-1)
    y_mask[non_mask] = 1e8
    y_min = L.reduce_min(L.flatten(y_mask, axis=1), dim=-1)

    return L.stack([x_min, y_min, x_max, y_max], 1)
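An illustrative plain-numpy rendering of the same min/max reduction (not the Paddle code above; the 1e8 sentinel pushes non-mask positions out of the min, and a -1 sentinel is used here for the max):

import numpy as np

masks = np.zeros((1, 5, 5), dtype=bool)
masks[0, 1:4, 2:5] = True  # a 3x3 blob
ys, xs = np.meshgrid(np.arange(5), np.arange(5), indexing="ij")
x_min = np.where(masks, xs, 1e8).reshape(1, -1).min(-1)
x_max = np.where(masks, xs, -1).reshape(1, -1).max(-1)
y_min = np.where(masks, ys, 1e8).reshape(1, -1).min(-1)
y_max = np.where(masks, ys, -1).reshape(1, -1).max(-1)
print(np.stack([x_min, y_min, x_max, y_max], 1))  # [[2. 1. 4. 3.]]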
Example No. 7
        def __call__(self, msg):
            alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
            if attn_drop:
                old_h = alpha
                dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
                u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                                     min=0.,
                                     max=1.)
                keeped = L.cast(u > dropout, dtype="float32")
                self_attn_mask = L.scale(x=keeped,
                                         scale=10000.0,
                                         bias=-1.0,
                                         bias_after_scale=False)
                n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                                axis=1)
                n_head_self_attn_mask.stop_gradient = True
                alpha = n_head_self_attn_mask + alpha
                alpha = L.lod_reset(alpha, old_h)

            h = msg["v"]
            alpha = paddle_helper.sequence_softmax(alpha)

            self.alpha = alpha
            old_h = h
            h = h * alpha
            h = L.lod_reset(h, old_h)
            h = L.sequence_pool(h, "sum")

            if concat:
                h = L.reshape(h, [-1, num_heads * hidden_size])
            else:
                h = L.reduce_mean(h, dim=1)
            return h
Example No. 8
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):

    input_shape = L.shape(input_mask)
    input_batch = input_shape[0]
    input_seqlen = input_shape[1]
    num_slot = input_seqlen / slot_seqlen
    num_b = num_slot - 1
    ones = L.ones([num_b], dtype="float32")  # [num_b]
    diag_ones = L.diag(ones)  # [num_b, num_b]
    diag_ones = L.unsqueeze(diag_ones, [1, -1])  # [num_b, 1, num_b, 1]
    diag_ones = L.expand(
        diag_ones,
        [1, slot_seqlen, 1, slot_seqlen])  # [num_b, seqlen, num_b, seqlen]
    diag_ones = L.reshape(diag_ones,
                          [1, num_b * slot_seqlen, num_b * slot_seqlen
                           ])  # [1, num_b*seqlen, num_b*seqlen]

    graph_attn_bias = L.concat([
        L.ones([1, num_b * slot_seqlen, slot_seqlen], dtype="float32"),
        diag_ones
    ], 2)
    graph_attn_bias = L.concat([
        L.ones([1, slot_seqlen, num_slot * slot_seqlen], dtype="float32"),
        graph_attn_bias
    ], 1)  # [1, seq, seq]

    pad_attn_bias = L.matmul(input_mask, input_mask,
                             transpose_y=True)  # [batch, seq, seq]
    attn_bias = graph_attn_bias * pad_attn_bias

    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
Example No. 9
def build_attn_bias(input_mask, n_head, dtype):
    attn_bias = L.matmul(input_mask, input_mask,
                         transpose_y=True)  # [batch, seq, seq]
    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
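A minimal usage sketch, assuming dygraph mode and the L alias (paddle.fluid.layers) from the snippet; input_mask follows the usual convention of 1.0 for real tokens and 0.0 for padding, shaped [batch, seq, 1]:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as L

with dg.guard(fluid.CPUPlace()):
    # one example, three positions, the last one padded
    mask = dg.to_variable(np.array([[[1.], [1.], [0.]]], dtype="float32"))
    bias = build_attn_bias(mask, n_head=2, dtype="float32")
    print(bias.shape)  # [1, 2, 3, 3]; pairs involving padding get -10000.0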
Example No. 10
    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        bs, h, w = mask.shape

        mask = mask.numpy()
        not_mask = ~mask
        not_mask = dg.to_variable(not_mask).astype('float32')
        y_embed = L.cumsum(not_mask, axis=1)  # [batch_size, h, w]
        x_embed = L.cumsum(not_mask, axis=2)  # [batch_size, h, w]
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = (np.arange(0, self.num_pos_feats, 1,
                           dtype="float32"))  # [num_pos_feats]
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats
                                   )  # [num_pos_feats]
        dim_t = dg.to_variable(dim_t)

        x_embed = L.unsqueeze(x_embed, 3)  # [batch_size, h, w, 1]
        y_embed = L.unsqueeze(y_embed, 3)  # [batch_size, h, w, 1]
        pos_x = x_embed / dim_t  # [batch_size, h, w, num_pos_feats]
        pos_y = y_embed / dim_t  # [batch_size, h, w, num_pos_feats]
        pos_x_1 = L.sin(pos_x[:, :, :,
                              0::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_x_2 = L.cos(pos_x[:, :, :,
                              1::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_y_1 = L.sin(pos_y[:, :, :,
                              0::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_y_2 = L.cos(pos_y[:, :, :,
                              1::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_x = L.reshape(L.stack([pos_x_1, pos_x_2], axis=4),
                          (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]
        pos_y = L.reshape(L.stack([pos_y_1, pos_y_2], axis=4),
                          (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]

        pos = L.concat((pos_y, pos_x),
                       axis=3)  # [batch_size, h, w, num_pos_feats * 2]
        pos = L.transpose(pos,
                          perm=(0, 3, 1,
                                2))  # [batch_size, num_pos_feats * 2, h, w]
        return pos
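The channel layout produced by the stack/reshape pair above, rendered in plain numpy for illustration (num_pos_feats=4 and temperature=10000 are assumed values):

import numpy as np

num_pos_feats, temperature = 4, 10000.0
dim_t = temperature ** (2 * (np.arange(num_pos_feats) // 2) / num_pos_feats)
pos = np.arange(3, dtype="float32")[:, None] / dim_t  # [3, num_pos_feats]
# pair sin of even channels with cos of odd channels, then flatten the pairs
out = np.stack([np.sin(pos[:, 0::2]), np.cos(pos[:, 1::2])], -1).reshape(3, -1)
print(out.shape)  # (3, 4): [sin, cos, sin, cos] along the channel dimension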
Example No. 11
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)
    layers.cond(grad_valid, lambda: state.increment(),
                lambda: state.decrement())
    return grad_valid
Example No. 12
    def _gen_input(self,
                   token_ids,
                   type_ids,
                   pos_ids,
                   input_mask,
                   aux_emb=None):
        token_emb_out = layers.embedding(
            input=token_ids,
            size=[self.vocab_size, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.token_emb_name,
                                       initializer=self.param_initializer))
        type_emb_out = layers.embedding(
            input=type_ids,
            size=[self.type_size, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.type_emb_name,
                                       initializer=self.param_initializer))
        pos_emb_out = layers.embedding(
            input=pos_ids,
            size=[self.max_position_seq_len, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.pos_emb_name,
                                       initializer=self.param_initializer))
        emb_out = token_emb_out + type_emb_out + pos_emb_out

        # auxiliary memory embeddings
        if aux_emb is not None:
            emb_out = layers.concat([aux_emb, emb_out], axis=1)

        # post process of embedding
        emb_out = pre_process_layer(emb_out,
                                    self.pre_encoder_cmd,
                                    self.prepostprocess_dropout,
                                    name="pre_encoder",
                                    epsilon=self.epsilon)
        if self.emb_mapping_in:
            emb_out = layers.fc(input=emb_out,
                                num_flatten_dims=2,
                                size=self.hidden_size,
                                param_attr=fluid.ParamAttr(
                                    name="emb_hidden_mapping",
                                    initializer=self.param_initializer),
                                bias_attr="emb_hidden_mapping_bias")

        # generate n-head self-attention mask
        self_attn_mask = input_mask
        self_attn_mask = layers.scale(x=self_attn_mask,
                                      scale=1e4,
                                      bias=-1.0,
                                      bias_after_scale=False)
        n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head,
                                             axis=1)
        n_head_self_attn_mask.stop_gradient = True

        return emb_out, n_head_self_attn_mask
Example No. 13
    def pad(self, input_ele):
        max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
        out_list = []
        for i in range(len(input_ele)):
            pad_len = max_len - input_ele[i].shape[0]
            one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
                                          pad_value=0.0)
            out_list.append(one_batch_padded)
        out_padded = layers.stack(out_list)
        return out_padded
Example No. 14
    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos=None,
                query_pos=None):
        output = tgt

        intermediate = []

        assert tgt_mask is None, "Not implement compute tgt_mask's attn_mask."

        if memory_mask is not None:
            bs, tgt_length = tgt.shape[:2]
            memory_length = memory.shape[1]
            attn_mask = L.zeros([bs, tgt_length, memory_length],
                                dtype="float32")
            memory_mask = L.expand(
                L.unsqueeze(memory_mask, [1]),
                (1, tgt_length, 1))  # [bs, tgt_length, memory_length]
            attn_mask = attn_mask.numpy()
            memory_mask = memory_mask.numpy()
            attn_mask[memory_mask] = -1e8
            attn_mask = dg.to_variable(attn_mask)
            attn_mask = L.expand(L.unsqueeze(attn_mask, [1]),
                                 (1, self.nhead, 1,
                                  1))  # [bs, nhead, tgt_length, memory_length]
            memory_mask = attn_mask

        attention_weight = []
        for layer in self.layers:
            output, self_attn_weights, multihead_attn_weights = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos=pos,
                query_pos=query_pos)

            attention_weight.append(
                (self_attn_weights, multihead_attn_weights))
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return L.stack(intermediate), attention_weight

        return L.unsqueeze(output, [0]), attention_weight
Example No. 15
    def forward(self, x, seq_mask, pad_index, hx=None):
        """Forward network"""
        x, batch_sizes, sorted_indices = self.pack_padded_sequence(
            x, seq_mask, pad_index)
        _, unsorted_indices = layers.argsort(sorted_indices)
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            ih = layers.zeros(shape=(self.num_layers * 2, batch_size,
                                     self.hidden_size),
                              dtype=x[0].dtype)
            h, c = ih, ih
        else:
            h, c = self.permute_hidden(hx, sorted_indices)
        h = layers.reshape(h, shape=(self.num_layers, 2, -1, self.hidden_size))
        c = layers.reshape(c, shape=(self.num_layers, 2, -1, self.hidden_size))

        for i in range(self.num_layers):
            x = layers.split(x, batch_sizes, dim=0)
            if self.training and self.dropout > 0:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [j * mask[:len(j)] for j in x]
            x_f, (h_f, c_f) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                 hx=(h[i, 1], c[i, 1]),
                                                 cell=self.b_cells[i],
                                                 batch_sizes=batch_sizes,
                                                 reverse=True)
            x = layers.concat((x_f, x_b), axis=-1)
            h_n.append(layers.stack((h_f, h_b)))
            c_n.append(layers.stack((c_f, c_b)))
        x = self.pad_packed_sequence(x, batch_sizes, unsorted_indices)
        hx = layers.concat(h_n, axis=0), layers.concat(c_n, axis=0)
        hx = self.permute_hidden(hx, unsorted_indices)

        return x, hx
Example No. 16
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)

    with layers.Switch() as switch:
        with switch.case(grad_valid):
            state.increment()
        with switch.default():
            state.decrement()
    return grad_valid
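This is the same dynamic loss-scale update as Example No. 11 above, written with the older layers.Switch control-flow construct in place of layers.cond.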
Example No. 17
    def _ranking(self, inputs, predictions):
        """ Reranking generated responses. """
        src_token = inputs["src_token"]
        src_mask = inputs["src_mask"]
        src_pos = inputs["src_pos"]
        src_type = inputs["src_type"]
        src_turn = inputs["src_turn"]
        src_embed = self.embedder(src_token, src_pos, src_type, src_turn)

        batch_size, num_latent, tgt_seq_len = predictions.shape

        # shape: [batch_size, num_latent, seq_len, 1]
        preds_token = F.unsqueeze(predictions, [3])
        preds_mask = F.not_equal(preds_token, self.padding_idx, "int64")
        preds_pos = layers.range(0, tgt_seq_len, 1, dtype="float32")
        preds_pos = F.unsqueeze(preds_pos, [0, 0, 1])
        preds_pos = layers.expand(preds_pos, [batch_size, num_latent, 1, 1])
        preds_pos = layers.cast(preds_pos, "int64")
        preds_type = layers.zeros_like(preds_token)
        preds_turn = layers.zeros_like(preds_token)

        scores = []
        for i in range(num_latent):
            pred_token = preds_token[:, i]
            pred_mask = preds_mask[:, i]
            pred_pos = preds_pos[:, i]
            pred_type = preds_type[:, i]
            pred_turn = preds_turn[:, i]

            input_mask = layers.concat([src_mask, pred_mask], axis=1)
            input_mask.stop_gradient = True
            pred_embed = self.embedder(pred_token, pred_pos, pred_type,
                                       pred_turn)
            embed = layers.concat([src_embed, pred_embed], axis=1)
            embed = self.embed_layer_norm(embed)

            mask_embed = self.mask_embed
            mask_embed = layers.expand(mask_embed, [batch_size, 1, 1])
            mask_embed = self.embed_layer_norm(mask_embed)

            out = layers.concat([mask_embed, embed], axis=1)
            mask = self._create_mask(input_mask, append_head=True)

            for layer in self.layers:
                out = layer(out, mask, None)

            mask_embed = out[:, 0]
            score = self.discriminator(mask_embed)
            scores.append(score[:, 0])
        scores = layers.stack(scores, axis=1)
        return scores
Example No. 18
    def pad_packed_sequence(self, x, batch_sizes, unsorted_indices):
        """Pads a packed sequence."""
        h_size = x.shape[1]
        split_x = layers.split(x, batch_sizes, dim=0)
        max_bs = batch_sizes[0]
        step_embs = []
        for step, cur_bs in enumerate(batch_sizes):
            pad_emb = layers.zeros(shape=(max_bs - cur_bs, h_size),
                                   dtype=x.dtype)
            step_emb = layers.concat(input=(split_x[step], pad_emb))
            step_embs.append(step_emb)
        new_x = layers.stack(step_embs, axis=1)
        new_x = layers.index_select(new_x, unsorted_indices)
        return new_x
Example No. 19
    def forward(self, x, adj):
        """Forward network"""

        x = layers.dropout(x, self.dropout)
        if self.layer == 1:
            x = layers.stack([att.forward(x, adj) for att in self.attentions],
                             axis=2)
            x = layers.reduce_sum(x, 2)
            x = layers.dropout(x, self.dropout)
            return layers.log_softmax(x, axis=2)
        else:
            x = layers.concat([att.forward(x, adj) for att in self.attentions],
                              axis=2)
            x = layers.dropout(x, self.dropout)
            return self.out_att.forward(x, adj)
Example No. 20
def pad_sequence_paddle(sequences, padding_value=0):
    """Fill sequences(variable) into a fixed-length matrix"""
    max_size = sequences[0].shape
    trailing_dims = max_size[1:]
    max_len = max([s.shape[0] for s in sequences])
    out_tensor = []
    for tensor in sequences:
        length = tensor.shape[0]
        pad_tensor = layers.concat((tensor,
                                    layers.fill_constant(
                                        (max_len - length, *trailing_dims),
                                        dtype=tensor.dtype,
                                        value=padding_value)))
        out_tensor.append(pad_tensor)
    out_tensor = layers.stack(out_tensor)
    return out_tensor
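A minimal usage sketch, assuming Paddle 1.x dygraph mode (and that the snippet's Paddle version accepts the zero-row pad tensor produced for the longest sequence):

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

with dg.guard(fluid.CPUPlace()):
    seqs = [
        dg.to_variable(np.ones((3, 4), dtype="float32")),
        dg.to_variable(np.ones((5, 4), dtype="float32")),
    ]
    padded = pad_sequence_paddle(seqs)
    print(padded.shape)  # [2, 5, 4]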
Example No. 21
    def teacher_forced_train(self, keys, values, text_lengths, speaker_embed, mel):
        # build decoder inputs by shifting over one frame and adding an all-zero <start> frame
        # the mel input is downsampled by a reduction factor
        batch_size = mel.shape[0]
        mel_input = F.reshape(mel, (batch_size, -1, self.decoder.reduction_factor, self.decoder.in_channels))
        zero_frame = F.zeros((batch_size, 1, self.decoder.in_channels), dtype="float32")
        # downsample mel input as a regularization
        mel_input = F.concat([zero_frame, mel_input[:, :-1, -1, :]], axis=1)

        # decoder
        decoded, hidden, attentions, final_state = self.decoder(mel_input, keys, values, text_lengths, 0, speaker_embed)
        attentions = F.stack(attentions)  # (N, B, T_dec, T_enc)
        # unfold frames
        decoded = F.reshape(decoded, (batch_size, -1, self.decoder.in_channels))
        # postnet
        refined = self.postnet(hidden, speaker_embed)
        return decoded, refined, attentions, final_state
Example No. 22
def pick_image(images, idx):
    """
    Pick the image among images according to idx.

    Args:
        images (B x N x C x H x W), N images, 
        idx (B ) indices to select.
    """
    if type(images) == list:
        return [pick_image(r, idx) for r in images]
    if idx is None:
        return images[:, 0]
    elif type(idx) == int:
        return images[:, idx]

    idx = idx.astype('long').numpy()
    images = L.stack([images[i][int(idx[i])] for i in range(images.shape[0])])
    return images
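A minimal usage sketch, assuming Paddle 1.x dygraph mode and the L alias (paddle.fluid.layers) from the snippet:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

with dg.guard(fluid.CPUPlace()):
    images = dg.to_variable(
        np.random.rand(2, 3, 1, 4, 4).astype("float32"))  # B=2, N=3
    idx = dg.to_variable(np.array([1, 2], dtype="int64"))
    picked = pick_image(images, idx)
    print(picked.shape)  # [2, 1, 4, 4]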
Example No. 23
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(2)

    x = layers.assign(
        np.random.rand(batch_size, beam_size, 32).astype("float32"))
    indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=10, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    scores = layers.array_write(x, step_idx)
    with while_op.block():
        bs = layers.cast(layers.shape(x)[0], "int64")
        for _ in range(20):
            bs = layers.cast(bs, 'int64')
        bs.stop_gradient = stop_gradient
        batch_pos = layers.expand(
            layers.unsqueeze(
                layers.range(
                    0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size])
        topk_coordinates = layers.stack([batch_pos, indices], axis=2)
        topk_coordinates.stop_gradient = stop_gradient
        score = layers.gather_nd(x, topk_coordinates)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.array_write(score, i=step_idx, array=scores)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        layers.assign(length_cond, cond)

    out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
    loss = layers.reduce_mean(out)
    opt = fluid.optimizer.Adam(0.01)
    opt.minimize(loss)
    exe = fluid.Executor(place)
    data = np.random.randint(
        low=0, high=beam_size, size=(batch_size, beam_size)).astype("int64")
    loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])

    return loss_val
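An illustrative invocation (static-graph mode; it assumes the module-level imports the snippet relies on, i.e. numpy as np, paddle.fluid as fluid, and fluid.layers as layers):

loss = build_and_run_program(fluid.CPUPlace(), batch_size=2, beam_size=4)
print(loss)  # a single float; seeds are fixed inside, so runs are repeatable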
Example No. 24
    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): shape (B, T), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. It can be a float point number or a Variable with 
                shape (1,), then this speaker_position_rate is used for every 
                example. It can also be a Variable with shape (B, ), which 
                contains a speaker position rate for each utterance.
        Returns:
            out (Variable): shape(B, T, C_pos), dtype float32, position embedding, where C_pos 
                means position embedding size.
        """
        batch_size, time_steps = indices.shape

        # convert speaker_position_rate to a Variable with shape(B, )
        if isinstance(speaker_position_rate, float):
            speaker_position_rate = dg.to_variable(
                np.array([speaker_position_rate]).astype("float32"))
            speaker_position_rate = F.expand(speaker_position_rate,
                                             [batch_size])
        elif isinstance(speaker_position_rate, fluid.framework.Variable) \
            and list(speaker_position_rate.shape) == [1]:
            speaker_position_rate = F.expand(speaker_position_rate,
                                             [batch_size])
        assert len(speaker_position_rate.shape) == 1 and \
            list(speaker_position_rate.shape) == [batch_size]

        weight = compute_position_embedding(self.weight,
                                            speaker_position_rate)  # (B, V, C)
        # make indices for gather_nd
        batch_id = F.expand(
            F.unsqueeze(
                F.range(
                    0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
        # (B, T, 2)
        gather_nd_id = F.stack([batch_id, indices], -1)

        out = F.gather_nd(weight, gather_nd_id)
        return out
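A minimal sketch of the batch-id / gather_nd indexing trick used above (dygraph mode; F is paddle.fluid.layers, as in the snippet):

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F

with dg.guard(fluid.CPUPlace()):
    weight = dg.to_variable(
        np.arange(24, dtype="float32").reshape(2, 4, 3))  # (B=2, V=4, C=3)
    indices = dg.to_variable(np.array([[0, 2], [1, 3]], dtype="int64"))
    batch_id = F.expand(
        F.unsqueeze(F.range(0, 2, 1, dtype="int64"), [1]), [1, 2])  # (2, 2)
    out = F.gather_nd(weight, F.stack([batch_id, indices], -1))
    print(out.shape)  # [2, 2, 3]: row b, slot t -> weight[b, indices[b, t]]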
Example No. 25
def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are sliced to 0.5 seconds randomly while conditions are not, upsampled conditions should also be sliced to extaclt match the time steps of the audio slice.

    Args:
        x (Variable): shape(B, C, T), dtype float32, the upsample condition.
        audio_start (Variable): shape(B, ), dtype: int64, the index the starting point.
        audio_length (int): the length of the audio (number of samples it contaions).

    Returns:
        Variable: shape(B, C, audio_length), cropped condition.
    """
    # crop audio
    slices = []  # for each example
    starts = audio_start.numpy()
    for i in range(x.shape[0]):
        start = starts[i]
        end = start + audio_length
        slice = F.slice(x[i], axes=[1], starts=[start], ends=[end])
        slices.append(slice)
    out = F.stack(slices)
    return out
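A minimal usage sketch, assuming dygraph mode and F aliased to paddle.fluid.layers as in the snippet:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

with dg.guard(fluid.CPUPlace()):
    cond = dg.to_variable(
        np.random.rand(2, 8, 100).astype("float32"))  # (B, C, T)
    start = dg.to_variable(np.array([10, 20], dtype="int64"))
    out = crop(cond, start, audio_length=50)
    print(out.shape)  # [2, 8, 50]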
Example No. 26
    def forward(self, outputs, target_sizes):
        """
        Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augmentation, but before padding
        """
        out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = L.softmax(out_logits, -1)  # [bs, num_queries, num_classes + 1]
        labels = L.argmax(prob[:, :, :], axis=-1)  # [bs, num_queries]
        scores = L.reduce_max(prob, dim=-1)  # [bs, num_queries]

        # convert to [x0, y0, x1, y1] format
        bs, num_queries, _ = out_bbox.shape
        out_bbox = L.reshape(out_bbox, (-1, 4))
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = L.reshape(boxes, (bs, num_queries, 4))
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes[:, 0], target_sizes[:, 1]
        scale_fct = L.stack([img_w, img_h, img_w, img_h], 1)  # [bs, 4]
        scale_fct = L.expand(L.unsqueeze(scale_fct, [1]), (1, num_queries, 1))
        boxes = boxes * scale_fct

        results = [{
            'scores': s,
            'labels': l,
            'boxes': b
        } for s, l, b in zip(scores.numpy(), labels.numpy(), boxes.numpy())]

        return results
Example No. 27
def beam_search_infilling(model,
                          q_ids,
                          q_sids,
                          sos_id,
                          eos_id,
                          attn_id,
                          max_encode_len=640,
                          max_decode_len=100,
                          beam_width=5,
                          tgt_type_id=3,
                          length_penalty=1.0):
    model.eval()
    _, __, info = model(q_ids, q_sids)
    d_batch, d_seqlen = q_ids.shape

    state = BeamSearchState(log_probs=L.zeros([d_batch, beam_width],
                                              'float32'),
                            lengths=L.zeros([d_batch, beam_width], 'int64'),
                            finished=L.zeros([d_batch, beam_width], 'int64'))
    outputs = []

    def reorder_(t, parent_id):
        """reorder cache according to parent beam id"""
        gather_idx = L.where(parent_id != -1)[:, 0] * beam_width + L.reshape(
            parent_id, [-1])
        t = L.gather(t, gather_idx)
        return t

    def tile_(t, times):
        _shapes = list(t.shape[1:])
        ret = L.reshape(
            L.expand(L.unsqueeze(t, [1]), [
                1,
                times,
            ] + [
                1,
            ] * len(_shapes)), [
                -1,
            ] + _shapes)
        return ret

    cached_k, cached_v = info['caches']
    cached_k = [tile_(k, beam_width) for k in cached_k]
    cached_v = [tile_(v, beam_width) for v in cached_v]
    past_cache = (cached_k, cached_v)

    q_ids = tile_(q_ids, beam_width)
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)

    cls_ids = L.ones([d_batch * beam_width], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch * beam_width], dtype='int64') * attn_id  # SOS
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch * beam_width, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)

        output, state = beam_search_step(state,
                                         logits[:, 1],
                                         eos_id=eos_id,
                                         beam_width=beam_width,
                                         is_first_step=(step == 0),
                                         length_penalty=length_penalty)
        outputs.append(output)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            reorder_(L.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            reorder_(L.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        pred_ids_flatten = L.reshape(output.predicted_ids,
                                     [d_batch * beam_width])
        ids = L.stack([pred_ids_flatten, attn_ids], 1)

        if state.finished.numpy().all():
            break

    final_ids = L.stack([o.predicted_ids for o in outputs], 0)
    final_parent_ids = L.stack([o.beam_parent_ids for o in outputs], 0)
    final_ids = L.gather_tree(final_ids, final_parent_ids)[:, :,
                                                           0]  # pick best beam
    final_ids = L.transpose(L.reshape(final_ids, [-1, d_batch * 1]), [1, 0])
    return final_ids
Example No. 28
    def beam_search(self,
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_len=256):
        def expand_to_beam_size(tensor, beam_size):
            tensor = layers.reshape(tensor,
                                    [tensor.shape[0], 1] + tensor.shape[1:])
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return layers.expand(tensor, tile_dims)

        def merge_batch_beams(tensor):
            return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
                                  tensor.shape[2:])

        def split_batch_beams(tensor):
            return fluid.layers.reshape(tensor,
                                        shape=[-1, beam_size] +
                                        list(tensor.shape[1:]))

        def mask_probs(probs, finished, noend_mask_tensor):
            # TODO: use where_op
            finished = layers.cast(finished, dtype=probs.dtype)
            probs = layers.elementwise_mul(layers.expand(
                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                           noend_mask_tensor,
                                           axis=-1) - layers.elementwise_mul(
                                               probs, (finished - 1), axis=0)
            return probs

        def gather(x, indices, batch_pos):
            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
            return layers.gather_nd(x, topk_coordinates)

        # run encoder
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

        # constant number
        inf = float(1. * 1e7)
        batch_size = enc_output.shape[0]
        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
        vocab_size_tensor = layers.fill_constant(shape=[1],
                                                 dtype="int64",
                                                 value=self.trg_vocab_size)
        end_token_tensor = to_variable(
            np.full([batch_size, beam_size], eos_id, dtype="int64"))
        noend_array = [-inf] * self.trg_vocab_size
        noend_array[eos_id] = 0
        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
        batch_pos = layers.expand(
            layers.unsqueeze(
                to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
            [1, beam_size])

        predict_ids = []
        parent_ids = []
        ### initialize states of beam search ###
        log_probs = to_variable(
            np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                     dtype="float32"))
        finished = to_variable(
            np.full([batch_size, beam_size], 0, dtype="bool"))
        ### initialize inputs and states of transformer decoder ###
        ## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
        trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                        dtype="int64",
                                        value=bos_id)
        trg_pos = layers.zeros_like(trg_word)
        trg_src_attn_bias = merge_batch_beams(
            expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_batch_beams(
            expand_to_beam_size(enc_output, beam_size))
        ## init states (caches) for transformer, need to be updated according to selected beam
        caches = [{
            "k":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
                dtype=enc_output.dtype,
                value=0),
            "v":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
                dtype=enc_output.dtype,
                value=0),
        } for i in range(self.n_layer)]

        for i in range(max_len):
            trg_pos = layers.fill_constant(shape=trg_word.shape,
                                           dtype="int64",
                                           value=i)
            caches = map_structure(  # cannot reshape at step 0: 0-sized dim
                lambda x: x if i == 0 else merge_batch_beams(x), caches)
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            caches = map_structure(split_batch_beams, caches)
            step_log_probs = split_batch_beams(
                fluid.layers.log(fluid.layers.softmax(logits)))
            step_log_probs = mask_probs(step_log_probs, finished,
                                        noend_mask_tensor)
            log_probs = layers.elementwise_add(x=step_log_probs,
                                               y=log_probs,
                                               axis=0)
            log_probs = layers.reshape(log_probs,
                                       [-1, beam_size * self.trg_vocab_size])
            scores = log_probs
            topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                          k=beam_size)
            beam_indices = fluid.layers.elementwise_floordiv(
                topk_indices, vocab_size_tensor)
            token_indices = fluid.layers.elementwise_mod(
                topk_indices, vocab_size_tensor)

            # update states
            caches = map_structure(
                lambda x: gather(x, beam_indices, batch_pos), caches)
            log_probs = gather(log_probs, topk_indices, batch_pos)
            finished = gather(finished, beam_indices, batch_pos)
            finished = layers.logical_or(
                finished, layers.equal(token_indices, end_token_tensor))
            trg_word = layers.reshape(token_indices, [-1, 1])

            predict_ids.append(token_indices)
            parent_ids.append(beam_indices)

            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        parent_ids = layers.stack(parent_ids, axis=0)
        finished_seq = layers.transpose(
            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
Example No. 29
    def forward(self):
        """Build the GATNE net.
        """
        param_attr_init = fluid.initializer.Uniform(
            low=-1.0, high=1.0, seed=np.random.randint(100))
        embed_param_attrs = fluid.ParamAttr(name='Base_node_embed',
                                            initializer=param_attr_init)

        # node_embeddings
        base_node_embed = fl.embedding(
            input=fl.reshape(self.train_inputs, shape=[-1, 1]),
            size=[self.num_nodes, self.embedding_size],
            param_attr=embed_param_attrs)

        node_features = []
        for edge_type in self.edge_types:
            param_attr_init = fluid.initializer.Uniform(
                low=-1.0, high=1.0, seed=np.random.randint(100))
            embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' %
                                                edge_type,
                                                initializer=param_attr_init)

            features = fl.embedding(
                input=self.gw[edge_type].node_feat['index'],
                size=[self.num_nodes, self.embedding_u_size],
                param_attr=embed_param_attrs)

            node_features.append(features)

        # mp_output: list of embedding(self.num_nodes, dim)
        mp_output = self.message_passing(self.gw, self.edge_types,
                                         node_features)

        # U : (num_type[m], num_nodes, dim[s])
        node_type_embed = fl.stack(mp_output, axis=0)

        # U : (num_nodes, num_type[m], dim[s])
        node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2])

        #gather node_type_embed from train_inputs
        node_type_embed = fl.gather(node_type_embed, self.train_inputs)

        # M_r
        trans_weights = fl.create_parameter(
            shape=[
                self.edge_type_count, self.embedding_u_size,
                self.embedding_size // self.att_head
            ],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w')

        # W_r
        trans_weights_s1 = fl.create_parameter(
            shape=[self.edge_type_count, self.embedding_u_size, self.dim_a],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s1')

        # w_r
        trans_weights_s2 = fl.create_parameter(
            shape=[self.edge_type_count, self.dim_a, self.att_head],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s2')

        trans_w = fl.gather(trans_weights, self.train_types)
        trans_w_s1 = fl.gather(trans_weights_s1, self.train_types)
        trans_w_s2 = fl.gather(trans_weights_s2, self.train_types)

        attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2)
        node_type_embed = fl.matmul(attention, node_type_embed)
        node_embed = base_node_embed + fl.reshape(
            fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size])

        self.last_node_embed = fl.l2_normalize(node_embed, axis=1)

        nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer(
            loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))
        nce_weight_attrs = fluid.ParamAttr(name='nce_weight',
                                           initializer=nce_weight_initializer)

        weight_pos = fl.embedding(input=self.train_labels,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        weight_neg = fl.embedding(input=self.train_negs,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1])
        pos_logits = fl.matmul(tmp_node_embed, weight_pos,
                               transpose_y=True)  # [B, 1, 1]

        neg_logits = fl.matmul(tmp_node_embed, weight_neg,
                               transpose_y=True)  # [B, 1, neg_num]

        pos_score = fl.squeeze(pos_logits, axes=[1])
        pos_score = fl.clip(pos_score, min=-10, max=10)
        pos_score = -1.0 * fl.logsigmoid(pos_score)

        neg_score = fl.squeeze(neg_logits, axes=[1])
        neg_score = fl.clip(neg_score, min=-10, max=10)
        neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)

        neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)
        self.loss = fl.reduce_mean(pos_score + neg_score)
Example No. 30
    def plot_results(self, samples, outputs, targets):
        # samples: [batch_size, 3, H, W]
        samples = [sample.numpy() for sample in samples]

        target_sizes = L.stack([t["size"] for t in targets], 0)

        results = self.postprocessor(outputs, target_sizes)

        for i, item in enumerate(zip(samples, results, targets)):
            image, result, target = item
            image = np.transpose(image, (1, 2, 0))
            std = np.array([0.229, 0.224, 0.225])
            mean = np.array([0.485, 0.456, 0.406])
            image = (image * std + mean) * 255
            image = image.astype(np.uint8)[:, :, ::-1]  # RGB -> BGR
            targ_image = image.copy()
            pred_img = image.copy()

            colors = [
                (0, 0, 255),
                (0, 255, 0),
                (255, 0, 0),
                (255, 255, 0),
                (255, 0, 255),
                (0, 255, 255),
                (0, 0, 128),
                (0, 128, 0),
                (128, 0, 0),
                (128, 128, 0),
                (128, 0, 128),
                (0, 128, 128),
            ]
            rect_num = len(target["boxes"])
            colors = colors * math.ceil(rect_num / 12)

            h, w = target["size"].numpy()
            for i, item in enumerate(zip(target["labels"], target["boxes"])):
                l, box = item
                color = colors[i]
                box = L.unsqueeze(box, [0])
                box = box_cxcywh_to_xyxy(box)  # [1, 4]
                box = L.squeeze(box, [0])  # [4]
                box = (box.numpy() * np.array([w, h, w, h])).astype(int)
                left_top, bottom_down = (box[0], box[1]), (box[2], box[3])
                cv2.rectangle(targ_image, left_top, bottom_down, color, 2)
                l = l.numpy()[0]
                if isinstance(self.label_to_text, dict):
                    label_name = self.label_to_text.get(str(l), str(l))
                else:
                    if l < len(self.label_to_text):
                        label_name = self.label_to_text[l]
                    else:
                        label_name = str(l)

                cv2.putText(targ_image, label_name, left_top,
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

            rect_num = len(result["labels"])
            colors = colors * math.ceil(rect_num / 12)
            for i, item in enumerate(
                    zip(result["scores"], result["labels"], result["boxes"])):
                s, l, box = item

                if l == self.background:
                    continue

                color = colors[i]
                left_top, bottom_down = (box[0], box[1]), (box[2], box[3])
                cv2.rectangle(pred_img, left_top, bottom_down, color, 2)

                if isinstance(self.label_to_text, dict):
                    label_name = self.label_to_text.get(str(l), str(l))
                else:
                    if l < len(self.label_to_text):
                        label_name = self.label_to_text[l]
                    else:
                        label_name = str(l)

                cv2.putText(pred_img, label_name + " [" + str(s)[:4] + "]",
                            left_top, cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

            show_image = np.concatenate((targ_image, pred_img), 1)
            cv2.imwrite(
                os.path.join(self.output_dir,
                             str(self.index) + ".jpg"), show_image)
            self.index = (self.index + 1) % self.pool_size