Example #1
    def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10):
        u = layers.uniform_random_batch_size_like(
            logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0)
        u.stop_gradient = True
        gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
        y = logits + gumbel
        return layers.softmax(y / tau)
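Note: `eps - layers.log(u + eps)` is the usual Gumbel noise `-log(-log(u))` with epsilon guards against `log(0)`. A minimal NumPy sketch (not from the source project) showing the same trick and the effect of the temperature `tau`:

import numpy as np

def gumbel_softmax_np(logits, tau=0.67, eps=1e-10):
    # Gumbel(0, 1) noise: -log(-log(u)), guarded by eps
    u = np.random.uniform(size=logits.shape)
    y = (logits - np.log(eps - np.log(u + eps))) / tau
    y = y - y.max(axis=-1, keepdims=True)   # numerically stable softmax
    e = np.exp(y)
    return e / e.sum(axis=-1, keepdims=True)

# smaller tau pushes the samples closer to one-hot
print(gumbel_softmax_np(np.array([1.0, 2.0, 0.5]), tau=0.1))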
Example #2
def gumbel_softmax(input, tau=1, eps=1e-10):
    """ Basic implementation of gumbel_softmax. """
    U = fluid.dygraph.to_variable(np.random.rand(*input.shape))
    # U = layers.uniform_random(input.shape, dtype=input.dtype, min=0.0, max=1.0)
    # U.stop_gradient = True
    gumbel = 0.0 - layers.log(eps - layers.log(U + eps))
    y = input + gumbel
    return layers.softmax(y / tau)
Example #3
def gumbel_softmax(logits, tau=0.67, eps=1e-10):
    """Gumbel softmax."""
    u = layers.uniform_random_batch_size_like(
        logits, shape=[-1, logits.shape[1]], min=0.0, max=1.0)
    u.stop_gradient = True
    gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
    y = logits + gumbel
    return layers.softmax(y / tau)
Example #4
def create_model(args, config):
    """Create model for given model configuration."""
    logging.info('building model')
    graph_wrapper = GraphWrapper(name="graph",
                                 node_feat=[('atom_type', [None, 1], "int64"),
                                            ('chirality_tag', [None,
                                                               1], "int64")],
                                 edge_feat=[('bond_type', [None, 1], "int64"),
                                            ('bond_direction', [None,
                                                                1], "int64")])

    # NOTE: [num_nodes, num_graphs], bs = num_graphs
    pos_mask = L.data(name='pos_mask',
                      shape=[-1, args.batch_size],
                      dtype='float32')
    neg_mask = L.data(name='neg_mask',
                      shape=[-1, args.batch_size],
                      dtype='float32')

    encoder = GINEncoder(config)
    global_repr, patch_summary = encoder.forward(graph_wrapper)

    global_D = FF(encoder.embedding_dim)
    local_D = FF(encoder.embedding_dim)
    g_enc = global_D.forward(global_repr)
    l_enc = local_D.forward(patch_summary)

    res = L.matmul(l_enc, g_enc, transpose_y=True)
    E_pos = get_positive_expectation(res * pos_mask,
                                     config['measure'],
                                     average=False)
    E_pos = L.reduce_sum(E_pos) / graph_wrapper.num_nodes
    E_neg = get_negative_expectation(res * neg_mask,
                                     config['measure'],
                                     average=False)
    E_neg = L.reduce_sum(E_neg) / (graph_wrapper.num_nodes *
                                   (graph_wrapper.num_graph - 1))
    local_global_loss = E_neg - E_pos

    if config['prior']:
        prior_D = PriorDiscriminator(encoder.embedding_dim)
        prior = L.uniform_random([args.batch_size, encoder.embedding_dim],
                                 min=0.0,
                                 max=1.0)
        term_1 = L.reduce_mean(L.log(prior_D.forward(prior)))
        term_2 = L.reduce_mean(L.log(1.0 - prior_D.forward(global_repr)))
        prior_loss = -(term_1 + term_2) * config['gamma']
    else:
        prior_loss = 0

    total_loss = local_global_loss + prior_loss

    keys = ['loss', 'graph_wrapper', 'encoder', 'graph_emb']
    Agent = namedtuple('Agent', keys)
    return Agent(loss=total_loss,
                 graph_wrapper=graph_wrapper,
                 encoder=encoder,
                 graph_emb=global_repr)
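For reference, the `pos_mask`/`neg_mask` inputs above mark which (node, graph) pairs in the `[num_nodes, num_graphs]` score matrix are positive (the node belongs to that graph) or negative. A minimal NumPy sketch of how such masks can be built (an illustration, not code from the project; `graph_ids` is a hypothetical node-to-graph assignment):

import numpy as np

def build_masks(graph_ids, num_graphs):
    # graph_ids[i] = index of the graph that node i belongs to
    num_nodes = len(graph_ids)
    pos_mask = np.zeros((num_nodes, num_graphs), dtype='float32')
    pos_mask[np.arange(num_nodes), graph_ids] = 1.0
    neg_mask = 1.0 - pos_mask
    return pos_mask, neg_mask

pos, neg = build_masks(np.array([0, 0, 1, 2, 2]), num_graphs=3)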
Example #5
    def _collect_metrics(self, inputs, outputs):
        """ Calculate loss function by using inputs and outputs. """
        metrics = {}

        tgt_len = layers.reduce_sum(
            layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
        tgt_len.stop_gradient = True

        label = inputs["tgt_token"][:, 1:]
        if self.label_smooth > 0:
            one_hot_label = layers.one_hot(label, self.num_token_embeddings)
            smooth_label = layers.label_smooth(one_hot_label,
                                               epsilon=self.label_smooth,
                                               dtype=self._dtype)
            nll = layers.cross_entropy(outputs["dec_pred"],
                                       smooth_label,
                                       soft_label=True,
                                       ignore_index=self.padding_idx)
        else:
            nll = layers.cross_entropy(outputs["dec_probs"],
                                       label,
                                       ignore_index=self.padding_idx)
        nll = layers.reduce_sum(nll, dim=1)
        token_nll = layers.reduce_sum(nll) / tgt_len
        nll = layers.reduce_mean(nll)
        metrics["nll"] = nll
        metrics["token_nll"] = token_nll
        loss = nll

        if self.num_latent > 0 and self.with_bow:
            bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
            bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
            if self.label_smooth > 0:
                bow = layers.cross_entropy(bow_probs,
                                           smooth_label,
                                           soft_label=True,
                                           ignore_index=self.padding_idx)
            else:
                bow = layers.cross_entropy(bow_probs,
                                           label,
                                           ignore_index=self.padding_idx)
            bow = layers.reduce_sum(bow, dim=1)
            token_bow = layers.reduce_sum(bow) / tgt_len
            bow = layers.reduce_mean(bow)
            metrics["bow"] = bow
            metrics["token_bow"] = token_bow
            loss = loss + bow

        if self.num_latent > 0 and self.use_discriminator:
            dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                         layers.log(1.0 - outputs["neg_probs"]))
            dis = layers.reduce_mean(dis)
            metrics["dis"] = dis
            loss = loss + dis * self.dis_ratio

        metrics["loss"] = loss
        metrics["token_num"] = tgt_len
        return metrics
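The label-smoothing branch above mixes each one-hot target with a uniform distribution before computing the soft-label cross entropy. A quick NumPy sketch of the transformation, assuming Paddle's standard `label_smooth` semantics:

import numpy as np

def label_smooth_np(one_hot, epsilon=0.1):
    # Paddle's label_smooth: (1 - eps) * label + eps / num_classes
    num_classes = one_hot.shape[-1]
    return (1.0 - epsilon) * one_hot + epsilon / num_classes

target = label_smooth_np(np.eye(4)[[2]], epsilon=0.1)
# target -> [[0.025, 0.025, 0.925, 0.025]]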
Example #6
def focal_loss(y_predict, y, alpha=0.85, gamma=2, epsilon=1e-6):
    '''
    A larger alpha penalizes the foreground class more heavily, i.e. weights it more;
    a larger gamma down-weights confident (easy) examples and focuses learning on hard ones.
    '''
    y = fluid.layers.clip(y, epsilon, 1 - epsilon)
    y_predict = fluid.layers.clip(y_predict, epsilon, 1 - epsilon)

    return -1 * (alpha * fluid.layers.pow((1 - y_predict), gamma) * y *
                 fluid.layers.log(y_predict) +
                 (1 - alpha) * fluid.layers.pow(y_predict, gamma) *
                 (1 - y) * fluid.layers.log(1 - y_predict))
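The same formula in NumPy, convenient for sanity-checking how alpha and gamma reweight examples (my sketch, not part of the project):

import numpy as np

def focal_loss_np(y_predict, y, alpha=0.85, gamma=2, epsilon=1e-6):
    y = np.clip(y, epsilon, 1 - epsilon)
    y_predict = np.clip(y_predict, epsilon, 1 - epsilon)
    return -(alpha * (1 - y_predict) ** gamma * y * np.log(y_predict)
             + (1 - alpha) * y_predict ** gamma * (1 - y) * np.log(1 - y_predict))

# an easy positive (p=0.9) contributes far less than a hard one (p=0.1)
print(focal_loss_np(np.array([0.9, 0.1]), np.array([1.0, 1.0])))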
Example #7
def focal_loss(pred, label, alpha=0.25, gamma=2, epsilon=1e-6):
    '''
    A larger alpha penalizes the foreground class more heavily, i.e. weights it more;
    a larger gamma down-weights confident (easy) examples and focuses learning on hard ones.
    '''
    pred = layers.clip(pred, epsilon, 1 - epsilon)
    label = layers.clip(label, epsilon, 1 - epsilon)
    loss = -1 * (alpha * layers.pow((1 - pred), gamma) * label * layers.log(pred) +
                 (1 - alpha) * layers.pow(pred, gamma) *
                 (1 - label) * layers.log(1 - pred))
    return loss
Example #8
    def sigmoid_focal_loss(self, x, label, fg_num, gamma=2.0, alpha=0.25):
        C = x.shape[1]
        eye = paddle.eye(C + 1, dtype='float32')
        one_hot = L.gather(eye, label)
        pos_mask = one_hot[:, 1:]  # positive-sample mask

        p = L.sigmoid(x)  # [batch_size * num_cells, 80], predicted class probabilities
        pos_loss = pos_mask * (0 - L.log(p + 1e-9)) * L.pow(1 - p,
                                                            gamma) * alpha
        neg_loss = (1.0 - pos_mask) * (0 - L.log(1 - p + 1e-9)) * L.pow(
            p, gamma) * (1 - alpha)
        focal_loss = pos_loss + neg_loss
        if fg_num > 0.5:  # when there is no gt (i.e. fg_num == 0), focal_loss is not divided by anything.
            focal_loss = focal_loss / fg_num
        return focal_loss
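The `eye`/`gather` pair above is a compact way to one-hot encode labels and then drop the background column (label 0). A quick NumPy illustration with hypothetical labels:

import numpy as np

C = 3                          # number of foreground classes
labels = np.array([0, 2, 3])   # 0 = background, 1..C = foreground
eye = np.eye(C + 1, dtype='float32')
one_hot = eye[labels]          # [N, C+1], gather rows of the identity
pos_mask = one_hot[:, 1:]      # drop the background column -> [N, C]
# pos_mask -> [[0,0,0], [0,1,0], [0,0,1]]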
Example #9
    def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
        """跟进文法约束完成一步解码逻辑

        Args:
            logits (Variable): shape = [batch_size, beam_size, vocab_size]
            next_cell_states (Variable): NULL
            decode_states (StateWrapper): NULL

        Returns: TODO

        Raises: NULL

        """
        # decode token logits that conform to the grammar rules
        logits, valid_table_mask = self._output_layer(logits, actions, gmr_mask, decode_states.valid_table_mask)

        # initialize vocab size
        self._vocab_size = logits.shape[-1]
        self._vocab_size_tensor = layers.fill_constant(shape=[1], dtype='int64', value=logits.shape[-1])

        # compute log probs and mask out the finished parts
        step_log_probs = layers.log(layers.softmax(logits))
        step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished)

        scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size])
        topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
        topk_scores = layers.reshape(topk_scores, shape=[-1])
        topk_indices = layers.reshape(topk_indices, shape=[-1])

        # beam that each top-k entry comes from
        beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor)
        # token id of each top-k entry
        token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor)

        # regroup step_log_probs according to where each top-k entry came from
        next_log_probs = nn_utils.batch_gather(
                layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]),
                topk_indices)
        def _beam_gather(x, beam_indices):
            """reshape x to beam dim, and gather each beam_indices
            Args:
                x (TYPE): NULL
            Returns: Variable
            """
            x = self.split_batch_beams(x)
            return nn_utils.batch_gather(x, beam_indices)
        next_cell_states = layers.utils.map_structure(lambda x: _beam_gather(x, beam_indices),
                                                      next_cell_states)
        next_finished = _beam_gather(decode_states.finished, beam_indices)
        next_lens = _beam_gather(decode_states.lengths, beam_indices)

        next_lens = layers.elementwise_add(next_lens,
                layers.cast(layers.logical_not(next_finished), next_lens.dtype))
        next_finished = layers.logical_or(next_finished,
                layers.equal(token_indices, self._end_token_tensor))

        decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
        decode_states = StateWrapper(next_cell_states, next_log_probs, next_finished, next_lens, valid_table_mask)

        return decode_output, decode_states
Example #10
    def get_embedding(self, num_embeddings,
                      embedding_dim, padding_idx=None):
        """
        Build sinusoidal embeddings.
        This matches the implementation in tensor2tensor,
        but differs slightly from the description
        in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)  # scalar, so use Python's math.log
        emb = layers.exp(layers.arange(
            start=0, end=half_dim, dtype='float32') * -emb)

        # [num_embeddings, embedding_dim // 2]
        emb = layers.unsqueeze(layers.arange(-num_embeddings // 2,
                                             num_embeddings // 2, dtype='float32'), axis=1) *\
            layers.unsqueeze(emb, axis=0)

        emb = layers.concat([layers.sin(emb), layers.cos(emb)], dim=1)
        # [num_embeddings, embedding_dim]
        if embedding_dim % 2 == 1:
            emb = layers.concat(
                [emb, layers.zeros(shape=(num_embeddings, 1))], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        self.origin_shift = num_embeddings // 2
        return emb
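A NumPy version of the same construction (a sketch following the tensor2tensor-style layout the docstring cites) makes the result easier to inspect:

import math
import numpy as np

def sinusoidal_np(num_embeddings, embedding_dim):
    half_dim = embedding_dim // 2
    freq = np.exp(np.arange(half_dim, dtype='float32')
                  * -(math.log(10000) / (half_dim - 1)))
    pos = np.arange(-num_embeddings // 2, num_embeddings // 2,
                    dtype='float32')[:, None]
    emb = np.concatenate([np.sin(pos * freq), np.cos(pos * freq)], axis=1)
    if embedding_dim % 2 == 1:  # zero-pad odd dimensions
        emb = np.concatenate([emb, np.zeros((num_embeddings, 1))], axis=1)
    return emb

emb = sinusoidal_np(10, 8)  # shape (10, 8)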
Example #11
            def grow_top_k(step_idx, alive_seq, alive_log_prob, parent_idx):
                pre_ids = alive_seq

                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))

                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                    enc_memory)

                projection = layers.matmul(dec_att_out, softmax_weight)

                logits = layers.softmax(projection)
                current_log = layers.elementwise_add(x=layers.log(logits),
                                                     y=alive_log_prob,
                                                     axis=0)
                base_1 = layers.cast(step_idx, 'float32') + 6.0
                base_1 /= 6.0
                length_penalty = layers.pow(base_1, alpha)

                current_log = layers.reshape(current_log, shape=[1, -1])

                current_log = current_log / length_penalty
                topk_scores, topk_indices = layers.topk(input=current_log,
                                                        k=beam_size)

                topk_scores = layers.reshape(topk_scores, shape=[-1])

                topk_log_probs = topk_scores * length_penalty

                generate_id = layers.reshape(topk_indices,
                                             shape=[-1]) % self.tar_vocab_size

                selected_beam = layers.reshape(
                    topk_indices, shape=[-1]) // self.tar_vocab_size

                topk_finished = layers.equal(generate_id, eos_ids)

                topk_finished = layers.cast(topk_finished, 'float32')

                generate_id = layers.reshape(generate_id, shape=[-1, 1])

                pre_tokens_list = layers.gather(tokens, selected_beam)

                full_tokens_list = layers.concat(
                    [pre_tokens_list, generate_id], axis=1)


                return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                        dec_att_out, new_hidden_array, new_cell_array
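`length_penalty` above is the GNMT length penalty lp(t) = ((5 + t) / 6)^alpha with t = step_idx + 1; scores are divided by it so longer hypotheses are not unfairly punished for accumulating more negative log-probability. A quick numeric check (illustrative values):

def length_penalty(t, alpha=0.6):
    # GNMT: lp(t) = ((5 + t) / 6) ** alpha
    return ((5.0 + t) / 6.0) ** alpha

print(length_penalty(1), length_penalty(10))  # 1.0, ~1.73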
Example #12
    def create_loss_op(self, predict, label, epsilon=1e-7):
        """compute loss with tensor

         Args:
         predict: model output tensor activated by softmax
         label: a non-sparse tensor

         Returns:
         loss: cross-entropy loss
         """
        if self.loss_type == "nl" and self.model_type == "train":
            one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
            one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
            # log
            neg_prob = 1 - predict
            log_neg_prob = FL.log(
                fluid.layers.clip(neg_prob, min=epsilon, max=1.))
            ce_loss = -1 * log_neg_prob * one_hot_label
            cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
        else:  # PL or evaluation
            cost = FL.cross_entropy(predict, label)

        loss = FL.mean(cost)

        return loss
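The "nl" (negative learning) branch penalizes the probability mass the model assigns to the given, possibly noisy, label by minimizing -log(1 - p_label). A NumPy rendering of that branch (a sketch, assuming `predict` holds softmax outputs):

import numpy as np

def nl_loss_np(predict, label, epsilon=1e-7):
    # predict: [N, C] softmax outputs; label: [N] integer class ids
    p_label = predict[np.arange(len(label)), label]
    return -np.log(np.clip(1.0 - p_label, epsilon, 1.0)).mean()

loss = nl_loss_np(np.array([[0.7, 0.2, 0.1]]), np.array([0]))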
Example #13
    def ce_conf_loss(self, pred_allboxes_conf, labels_pos_mask,
                     labels_neg_mask, class_vectors, labels_pos_cid2, gt_area):
        labels_pos_cid2 = P.reshape(labels_pos_cid2,
                                    (-1, ))  # [batch_size*num_priors]
        pred_allboxes_conf_r = P.reshape(
            pred_allboxes_conf, (-1, P.shape(pred_allboxes_conf)[2]
                                 ))  # [batch_size*num_priors, num_classes]
        label_prob = P.gather(
            class_vectors,
            labels_pos_cid2)  # one-hot mask  (batch_size*num_priors, num_classes)

        pred_prob = P.softmax(pred_allboxes_conf_r)
        pred_prob = P.cast(pred_prob, 'float32')
        prob_loss = label_prob * (0 - P.log(pred_prob + 1e-9))  # tiny constant added to avoid NaN
        prob_loss = P.reduce_sum(prob_loss, dim=1)

        # keep only the losses of positive and negative examples
        labels_pos_mask2 = P.reshape(labels_pos_mask,
                                     (-1, ))  # [batch_size*num_priors]
        labels_neg_mask2 = P.reshape(labels_neg_mask,
                                     (-1, ))  # [batch_size*num_priors]
        conf_loss_scale = 2.0 - gt_area  # the smaller the GT area, the larger the weight (more emphasis)
        conf_loss_scale = P.reshape(conf_loss_scale,
                                    (-1, ))  # [batch_size*num_priors]
        prob_pos_loss = prob_loss * labels_pos_mask2 * conf_loss_scale
        prob_neg_loss = prob_loss * labels_neg_mask2
        ce_loss = prob_pos_loss + prob_neg_loss
        ce_loss = P.reduce_sum(ce_loss)

        return ce_loss
Example #14
        def grow_topk(i, logits, alive_seq, alive_log_probs, states):
            logits = layers.reshape(logits, [batch_size, beam_size, -1])
            candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
            log_probs = layers.elementwise_add(candidate_log_probs,
                                               alive_log_probs, 0)

            length_penalty = np.power(5.0 + (i + 1.0) / 6.0, alpha)
            curr_scores = log_probs / length_penalty
            flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])

            topk_scores, topk_ids = layers.topk(flat_curr_scores,
                                                k=beam_size * 2)

            topk_log_probs = topk_scores * length_penalty

            topk_beam_index = topk_ids // self.trg_vocab_size
            topk_ids = topk_ids % self.trg_vocab_size

            # use gather as gather_nd, TODO: use gather_nd
            topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index,
                                           beam_size, batch_size)
            topk_seq = layers.concat(
                [topk_seq,
                 layers.reshape(topk_ids, topk_ids.shape + [1])],
                axis=2)
            states = update_states(states, topk_beam_index, beam_size)
            eos = layers.fill_constant(shape=topk_ids.shape,
                                       dtype="int64",
                                       value=eos_id)
            topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")

            #topk_seq: [batch_size, 2*beam_size, i+1]
            #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
            return topk_seq, topk_log_probs, topk_scores, topk_finished, states
Example #15
def chunk_softmax(logits, labels, topk=10):
    after_exp = L.exp(logits)
    out, _ = L.argsort(after_exp, axis=-1)
    denorm = L.reduce_sum(out[:, -topk:], dim=-1, keep_dim=True)
    probs = after_exp / denorm
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * L.log(probs)) / logits.shape[0]
    return loss
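`chunk_softmax` normalizes the exponentials by the sum of only the top-k largest ones, so the top-k tokens' "probabilities" sum to 1 while all others keep a small residual mass. A NumPy sketch of my reading of the code above (the max-shift is added for stability and cancels out):

import numpy as np

def chunk_softmax_np(logits, topk=10):
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    # denominator: sum of only the top-k largest exponentials
    denorm = np.sort(e, axis=-1)[:, -topk:].sum(axis=-1, keepdims=True)
    return e / denorm

p = chunk_softmax_np(np.random.randn(2, 100), topk=10)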
Example #16
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # the first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  #[gather new beam state according to new beam id]
    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())
    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
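The `idx // vocab_size` and `idx % vocab_size` pair recovers the source beam and the emitted token from a top-k taken over the flattened `[beam_width * vocab_size]` score matrix. A small NumPy demonstration with toy sizes:

import numpy as np

vocab_size, beam_width = 5, 3
scores = np.random.rand(beam_width * vocab_size)  # flattened [W * V] scores
idx = np.argsort(scores)[::-1][:beam_width]       # top-k flat indices
next_beam_id = idx // vocab_size                  # which beam each entry came from
next_word_id = idx % vocab_size                   # which token each entry emits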
Example #17
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape  # batch size is 1 in this hub module, so the first dim (bsz * beam_size) equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # make [UNK] prob = 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # the first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  #[gather new beam state according to new beam id]

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Example #18
 def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
               trg_src_attn_bias):
     # gather cell states corresponding to selected parent
     pre_caches = map_structure(
         lambda x: layers.gather(x, index=gather_idx), caches)
     pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                       index=gather_idx)
     pre_pos = layers.elementwise_mul(
         x=layers.fill_constant_batch_size_like(
             input=pre_src_attn_bias,  # can't use a LoD tensor here
             value=1,
             shape=[-1, 1],
             dtype=pre_ids.dtype),
         y=step_idx,
         axis=0)
     logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                           trg_vocab_size,
                           max_in_len,
                           n_layer,
                           n_head,
                           d_key,
                           d_value,
                           d_model,
                           d_inner_hid,
                           prepostprocess_dropout,
                           attention_dropout,
                           relu_dropout,
                           preprocess_cmd,
                           postprocess_cmd,
                           weight_sharing,
                           enc_output=enc_output,
                           caches=pre_caches,
                           bos_idx=bos_idx)
     # intra-beam topK
     topk_scores, topk_indices = layers.topk(
         input=layers.softmax(logits), k=beam_size)
     accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                          y=pre_scores,
                                          axis=0)
     # beam_search op uses lod to differentiate branches.
     accu_scores = layers.lod_reset(accu_scores, pre_ids)
     # topK reduction across beams, also contain special handle of
     # end beams and end sentences(batch reduction)
     selected_ids, selected_scores, gather_idx = layers.beam_search(
         pre_ids=pre_ids,
         pre_scores=pre_scores,
         ids=topk_indices,
         scores=accu_scores,
         beam_size=beam_size,
         end_id=eos_idx,
         return_parent_idx=True)
     step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
     layers.array_write(selected_ids, i=step_idx, array=ids)
     layers.array_write(selected_scores, i=step_idx, array=scores)
     return (step_idx, selected_ids, selected_scores, gather_idx,
             pre_caches, pre_src_attn_bias)
Example #19
File: loss.py  Project: zuswil/medSeg
def weighed_binary_cross_entropy(y, y_predict, beta=2, epsilon=1e-6):
    """
		返回 wce loss
		beta标记的是希望positive类给到多少的权重,如果positive少,beta给大于1相当与比0的类更重视
	"""
    y = fluid.layers.clip(y, epsilon, 1 - epsilon)
    y_predict = fluid.layers.clip(y_predict, epsilon, 1 - epsilon)

    ylogp = fluid.layers.elementwise_mul(y, fluid.layers.log(y_predict))
    betas = fluid.layers.fill_constant(ylogp.shape, "float32", beta)
    ylogp = fluid.layers.elementwise_mul(betas, ylogp)

    ones = fluid.layers.fill_constant(y_predict.shape, "float32", 1)
    ylogp = fluid.layers.elementwise_add(
        ylogp,
        fluid.layers.elementwise_mul(
            fluid.layers.elementwise_sub(ones, y),
            fluid.layers.log(fluid.layers.elementwise_sub(ones, y_predict))))

    zeros = fluid.layers.fill_constant(y_predict.shape, "float32", 0)
    return fluid.layers.elementwise_sub(zeros, ylogp)
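The same loss in NumPy, which makes the beta weighting explicit (a sketch, not code from medSeg):

import numpy as np

def wce_np(y, y_predict, beta=2, epsilon=1e-6):
    y = np.clip(y, epsilon, 1 - epsilon)
    y_predict = np.clip(y_predict, epsilon, 1 - epsilon)
    # the positive term is scaled by beta; the negative term is not
    return -(beta * y * np.log(y_predict) + (1 - y) * np.log(1 - y_predict))

# with beta=2, a missed positive costs about twice a missed negative
print(wce_np(np.array([1.0, 0.0]), np.array([0.5, 0.5])))  # ~[1.386, 0.693]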
Example #20
def _de_sigmoid(x, eps=1e-7):
    # clip x to the range [eps, 1/eps]
    x = L.clip(x, eps, 1 / eps)

    # take the reciprocal, then subtract one
    x = 1.0 / x - 1.0

    # clip e^(-x) to the range [eps, 1/eps]
    x = L.clip(x, eps, 1 / eps)

    # take the log, then negate
    x = -L.log(x)
    return x
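`_de_sigmoid` is the logit function, the inverse of the sigmoid, with clipping for numerical safety: since p = 1 / (1 + e^(-x)), we have 1/p - 1 = e^(-x) and thus x = -log(1/p - 1). A NumPy round-trip check (my sketch):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def de_sigmoid_np(p, eps=1e-7):
    p = np.clip(p, eps, 1.0 / eps)
    q = np.clip(1.0 / p - 1.0, eps, 1.0 / eps)  # (1 - p) / p = e^(-x)
    return -np.log(q)                           # recover x

z = np.array([-3.0, 0.5, 4.0])
print(de_sigmoid_np(sigmoid(z)))  # ~[-3.0, 0.5, 4.0]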
Example #21
    def focal_conf_loss(self,
                        pred_allboxes_conf,
                        labels_pos_mask,
                        labels_neg_mask,
                        class_vectors,
                        labels_pos_cid2,
                        focal_loss_alpha=0.25,
                        focal_loss_gamma=2):
        labels_pos_cid2 = P.reshape(labels_pos_cid2,
                                    (-1, ))  # [batch_size*num_priors]
        pred_allboxes_conf_r = P.reshape(
            pred_allboxes_conf, (-1, P.shape(pred_allboxes_conf)[2]
                                 ))  # [batch_size*num_priors, num_classes]
        label_prob = P.gather(
            class_vectors,
            labels_pos_cid2)  # one-hot mask  (batch_size*num_priors, num_classes)

        # We can switch to sigmoid activation at training time while keeping softmax at inference.
        # This works because whichever logit has the largest sigmoid also has the largest softmax.
        pred_prob = P.sigmoid(pred_allboxes_conf_r)
        pred_prob = P.cast(pred_prob, 'float32')

        # focal_loss
        labels_pos_mask2 = P.reshape(labels_pos_mask,
                                     (-1, ))  # [batch_size*num_priors]
        labels_neg_mask2 = P.reshape(labels_neg_mask,
                                     (-1, ))  # [batch_size*num_priors]
        prob_pos_loss = label_prob * (
            0 - P.log(pred_prob + 1e-9)) * focal_loss_alpha * (
                1.0 - pred_prob)**focal_loss_gamma
        prob_neg_loss = (1 - label_prob) * (
            0 - P.log(1 - pred_prob + 1e-9)) * (
                1.0 - focal_loss_alpha) * pred_prob**focal_loss_gamma
        focal_loss = prob_pos_loss + prob_neg_loss
        focal_loss = P.reduce_sum(focal_loss, dim=1)
        focal_loss = focal_loss * (labels_pos_mask2 + labels_neg_mask2)
        focal_loss = P.reduce_sum(focal_loss)

        return focal_loss
Example #22
def log_sum_exp(x):
    """预测为背景的概率是(axx是神经网络的输出)
    p = e^(a00-max)/[e^(a00-max)+e^(a01-max)+...+e^(a80-max)]
    取对数
    lnp = a00-max-ln[e^(a00-max)+e^(a01-max)+...+e^(a80-max)]
    移项
    a00 = lnp + max + ln[e^(a00-max)+e^(a01-max)+...+e^(a80-max)]
    如果真的是背景类,标记p=1,所以
    a00 = max + ln[e^(a00-max)+e^(a01-max)+...+e^(a80-max)]
    神经网络的输出要尽量接近等号右边,才能预测为背景类。
    """
    x_max = P.reduce_max(x)
    return P.log(P.reduce_sum(P.exp(x - x_max), 1)) + x_max
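The max-shift is what makes this log-sum-exp numerically stable. Comparing against the naive form in NumPy (illustrative values):

import numpy as np

def log_sum_exp_np(x):
    x_max = x.max()
    return np.log(np.exp(x - x_max).sum(axis=1)) + x_max

x = np.array([[1000.0, 1001.0, 999.0]])
print(np.log(np.exp(x).sum(axis=1)))  # naive: overflow -> [inf]
print(log_sum_exp_np(x))              # stable: ~[1001.41]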
Example #23
    def _get_metrics(self, inputs, outputs):
        metrics = super(Plato, self)._get_metrics(inputs, outputs)
        if self.use_bow:
            fc_out = self._calc_bow_logits(outputs["enc_out"], inputs["bow_pos"])
            bow_loss = layers.softmax_with_cross_entropy(
                logits=fc_out, label=inputs["bow_label"])
            mean_bow_loss = layers.mean(bow_loss)
            metrics["bow_loss"] = mean_bow_loss
            metrics["loss"] = metrics["loss"] + mean_bow_loss

        entropy_loss = layers.reduce_sum(outputs["post_probs"] * layers.log(outputs["post_probs"]), dim=1)
        mean_entropy_loss = layers.mean(entropy_loss)
        metrics["entropy_loss"] = mean_entropy_loss
        if self.use_entropy:
            metrics["loss"] = metrics["loss"] + mean_entropy_loss
        return metrics
Example #24
    def func(self, place):
        shape = [2, 3, 7, 9]
        eps = 1e-6
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True
        y = layers.log(x)

        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

        gradient_checker.double_grad_check([x],
                                           y,
                                           x_init=x_arr,
                                           place=place,
                                           eps=eps)
Example #25
    def func(self, place):
        shape = [2, 3, 7, 9]
        eps = 1e-6
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True
        y = layers.log(x)

        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        gradient_checker.double_grad_check_for_dygraph(
            self.log_wrapper, [x], y, x_init=x_arr, place=place)
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
Example #26
    def _posteriori_network(self, input_mask, embed, batch_size, src_len, tgt_len):
        """ Basic posterior network implementation. """
        mask_embed = self.mask_embed
        mask_embed = layers.expand(mask_embed, [batch_size, 1, 1])
        mask_embed = self.embed_layer_norm(mask_embed)
        post_embed = layers.concat([mask_embed, embed], axis=1)

        mask = self._create_mask(input_mask, auto_regressive=not self.bidirectional_context,
                                 append_head=True)

        for layer in self.layers:
            post_embed = layer(post_embed, mask, None)

        post_embed = post_embed[:, 0]
        post_logits = self.post_network(post_embed)
        post_probs = layers.softmax(post_logits, axis=-1)
        post_logits = layers.log(post_probs)
        return post_embed, post_probs, post_logits
Example #27
    def pairwise_hinge(self):
        """pairwise model"""
        poi_repr = L.split(self.poi_repr, 2, dim=0)
        pos_repr, neg_repr = poi_repr
        pos_pred = L.cos_sim(self.query_repr, pos_repr)
        neg_pred = L.cos_sim(self.query_repr, neg_repr)

        mode = 'hinge_loss'
        # logistic: log(1 + e^(-z));  hinge: max(0, 1 - z)
        if 'hinge_loss' == mode:
            theta_z = L.relu(1 + neg_pred - pos_pred)
        elif 'logistic_loss' == mode:
            theta_z = L.log(1 + L.exp(neg_pred - pos_pred))
        self.loss = L.reduce_mean(theta_z)
        pos_cnt = L.reduce_sum(L.cast(L.greater_than(pos_pred, neg_pred), dtype="float32"))
        neg_cnt = L.reduce_sum(L.cast(L.less_than(pos_pred, neg_pred), dtype="float32"))
        self.order = pos_cnt / (1e-5 + neg_cnt)
        self.metrics = [self.loss, self.order]
Example #28
    def _decode(self, state):
        """ Decoding one time stamp. """
        # shape: [batch_size, 1, seq_len]
        mask = state["mask"]

        # shape: [batch_size, 1]
        pred_token = state["pred_token"]
        pred_mask = state["pred_mask"]
        pred_pos = state["pred_pos"]
        pred_type = state["pred_type"]
        pred_turn = state["pred_turn"]

        # list of shape(len: num_layers): [batch_size, seq_len, hidden_dim]
        cache = state["cache"]

        pred_embed = self.embedder(pred_token, pred_pos, pred_type, pred_turn)
        pred_embed = self.embed_layer_norm(pred_embed)

        # shape: [batch_size, 1, seq_len + 1]
        mask = layers.concat([mask, 1 - pred_mask], axis=2)

        # shape: [batch_size, 1, hidden_dim]
        for l, layer in enumerate(self.layers):
            pred_embed = layer(pred_embed, mask, cache[f"layer_{l}"])

        # shape: [batch_size, 1, vocab_size]
        if self.two_layer_predictor:
            pred_embed = self.pre_predictor(pred_embed)
        if self.weight_sharing:
            token_embedding = self.embedder.token_embedding.weight
            pred_logits = layers.matmul(
                x=pred_embed,
                y=token_embedding,
                transpose_y=True
            )
        else:
            pred_logits = self.predictor(pred_embed)
        pred_logits = pred_logits[:, 0]
        pred_probs = layers.softmax(pred_logits, axis=1)
        pred_logits = layers.log(pred_probs)

        state["mask"] = mask
        return pred_logits, state
Example #29
    def grow_topk(self, i, logits, alive_seq, alive_log_probs, cache, enc_output, enc_bias):
        """
            grow_topk
        """
        logits = layers.reshape(logits, [self.batch_size, self.beam_size, -1])
        
        candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
        log_probs = candidate_log_probs + layers.unsqueeze(alive_log_probs, axes=[2]) 
        
        base_1 = layers.cast(i, 'float32') + 6.0
        base_1 /= 6.0
        length_penalty = layers.pow(base_1, self.alpha)
        #length_penalty = layers.pow(((5.0 + layers.cast(i+1, 'float32')) / 6.0), self.alpha)
        
        curr_scores = log_probs / length_penalty
        flat_curr_scores = layers.reshape(curr_scores, [self.batch_size, self.beam_size * self.vocab_size])

        topk_scores, topk_ids = layers.topk(flat_curr_scores, k=self.beam_size * 2)
        
        topk_log_probs = topk_scores * length_penalty

        select_beam_index = topk_ids // self.vocab_size
        select_id = topk_ids % self.vocab_size

        #layers.Print(select_id, message="select_id", summarize=1024)
        #layers.Print(topk_scores, message="topk_scores", summarize=10000000)
        
        flat_select_beam_index = layers.reshape(select_beam_index, [-1]) + self.gather_top2k_append_index
        
        topk_seq = layers.gather(alive_seq, flat_select_beam_index)
        topk_seq = layers.reshape(topk_seq, [self.batch_size, 2 * self.beam_size, -1])
        
        
        #concat with current ids
        topk_seq = layers.concat([topk_seq, layers.unsqueeze(select_id, axes=[2])], axis=2)
        topk_finished = layers.cast(layers.equal(select_id, self.eos_id), 'float32') 
        
        #gather cache
        self.gather_cache(cache, flat_select_beam_index)

        #topk_seq: [batch_size, 2*beam_size, i+1]
        #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
        return topk_seq, topk_log_probs, topk_scores, topk_finished, cache
Example #30
    def loss_neg_log_of_pos(self, pos_score, neg_score_n, gamma=5.0):
        """
            pos_score: batch_size x 1
            neg_score_n: batch_size x n
        """
        # n x batch_size
        neg_score_n = L.transpose(neg_score_n, [1, 0])
        # 1 x batch_size
        pos_score = L.reshape(pos_score, [1, -1])
        exp_pos_score = L.exp(pos_score * gamma)
        exp_neg_score_n = L.exp(neg_score_n * gamma)
        # (n+1) x batch_size
        pos_neg_score = L.concat([exp_pos_score, exp_neg_score_n], axis=0)
        # 1 x batch_size
        exp_sum = L.reduce_sum(pos_neg_score, dim=0, keep_dim=True)
        # 1 x batch_size
        loss = -1.0 * L.log(exp_pos_score / exp_sum)
        # batch_size x 1
        loss = L.reshape(loss, [-1, 1])
        return loss
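This loss is softmax cross-entropy over the positive score and the n negative scores with temperature gamma, i.e. -log softmax(gamma * scores)[positive]. A NumPy sketch of that equivalence (toy values, batch-major layout for simplicity):

import numpy as np

def neg_log_of_pos_np(pos_score, neg_score_n, gamma=5.0):
    # scores: [batch_size, 1 + n], positive column first
    scores = np.concatenate([pos_score, neg_score_n], axis=1) * gamma
    scores = scores - scores.max(axis=1, keepdims=True)  # stable softmax
    probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return -np.log(probs[:, :1])  # -log P(positive), shape [batch_size, 1]

loss = neg_log_of_pos_np(np.array([[0.8]]), np.array([[0.1, 0.3]]))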