Example No. 1
    def check_switch(self, value):
        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)

        result = layers.create_global_var(shape=[1],
                                          value=-1.0,
                                          dtype='float32',
                                          persistable=True)

        with layers.Switch() as switch:
            with switch.case(layers.less_than(x, zero_var)):
                layers.assign(zero_var, result)
            with switch.case(layers.less_than(x, one_var)):
                layers.assign(one_var, result)
            with switch.case(layers.less_than(x, two_var)):
                layers.assign(two_var, result)
            with switch.default():
                layers.assign(three_var, result)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
        exe.run(default_startup_program())

        out = exe.run(feed={}, fetch_list=[result])[0][0]
        return out
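For context, a minimal self-contained sketch of the same Switch pattern outside the test class (assuming the paddle.fluid 1.x static-graph API; run_switch and the program names are illustrative, not part of the original example):

import paddle.fluid as fluid
import paddle.fluid.layers as layers

def run_switch(value):
    # Assign 0.0 if the input is negative, otherwise 1.0.
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        result = layers.create_global_var(shape=[1],
                                          value=-1.0,
                                          dtype='float32',
                                          persistable=True)
        with layers.Switch() as switch:
            with switch.case(layers.less_than(x, zero_var)):
                layers.assign(zero_var, result)
            with switch.default():
                layers.assign(one_var, result)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    return exe.run(main_prog, fetch_list=[result])[0][0]

# run_switch(-1.0) returns 0.0, run_switch(2.0) returns 1.0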
Example No. 2
    def increment(self):
        enough_steps = layers.less_than(self.increment_every,
                                        self.good_steps + 1)
        with layers.Switch() as switch:
            with switch.case(enough_steps):
                new_scale = self.scale * self.factor
                scale_valid = layers.isfinite(new_scale)
                with layers.Switch() as switch2:
                    with switch2.case(scale_valid):
                        layers.assign(new_scale, self.scale)
                        layers.assign(layers.zeros_like(self.good_steps),
                                      self.good_steps)
                    with switch2.default():
                        layers.increment(self.good_steps)
            with switch.default():
                layers.increment(self.good_steps)
Example No. 3
    def decrement(self):
        new_scale = self.scale / self.factor
        one = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        less_than_one = layers.less_than(new_scale, one)
        with layers.Switch() as switch:
            with switch.case(less_than_one):
                layers.assign(one, self.scale)
            with switch.default():
                layers.assign(new_scale, self.scale)

        layers.assign(layers.zeros_like(self.good_steps), self.good_steps)
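Examples No. 2 and 3 read like methods of a dynamic loss-scale holder. A hedged sketch of the state they rely on (only the attribute names scale, factor, increment_every and good_steps come from the snippets above; the class name, default values and initialisation below are assumptions):

import paddle.fluid.layers as layers

class LossScaleState(object):  # hypothetical container, for illustration only
    def __init__(self, init_scale=2.**15, factor=2.0, increment_every=1000):
        # Persistable scalars so the values survive across iterations.
        self.scale = layers.create_global_var(shape=[1],
                                              value=init_scale,
                                              dtype='float32',
                                              persistable=True)
        self.good_steps = layers.create_global_var(shape=[1],
                                                   value=0,
                                                   dtype='int32',
                                                   persistable=True)
        self.increment_every = layers.fill_constant(shape=[1],
                                                    dtype='int32',
                                                    value=increment_every)
        self.factor = factor

The increment() and decrement() methods from Examples No. 2 and 3 would then operate on this state.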
Example No. 4
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)

    with layers.Switch() as switch:
        with switch.case(grad_valid):
            state.increment()
        with switch.default():
            state.decrement()
    return grad_valid
Example No. 5
def backward(self, loss, **kwargs):
    state = mixed_precision_global_state()
    callbacks = 'callbacks' in kwargs and kwargs['callbacks'] or None
    if callbacks is None:
        from paddle.fluid.clip import error_clip_callback
        callbacks = [error_clip_callback]  # XXX what if gradient is zero?
    if state is not None:
        kwargs['callbacks'] = [scale_gradient] + callbacks
    else:
        kwargs['callbacks'] = callbacks
    param_grads = self._backward(loss, **kwargs)
    if state is not None:
        grad_valid = update_loss_scale(v for k, v in param_grads)
        if state.dynamic_scaling:
            with layers.Switch() as switch:
                with switch.case(grad_valid):
                    pass
                with switch.default():
                    for _, g in param_grads:
                        layers.assign(layers.zeros_like(g), g)

    return param_grads
Example No. 6
    def test_condition_dtype():
        # cond, zero_var and result are created in the enclosing test;
        # Switch.case expects a boolean condition tensor.
        with layers.Switch() as switch:
            with switch.case(cond):
                layers.assign(zero_var, result)
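This fragment is the body of a dtype check. A hedged sketch of how such a check could be made self-contained (the use of TypeError for a non-bool condition is an assumption about fluid's Switch, not something stated in the snippet; the function name is illustrative):

import paddle.fluid as fluid
import paddle.fluid.layers as layers

def non_bool_condition_is_rejected():
    with fluid.program_guard(fluid.Program(), fluid.Program()):
        # Switch.case expects a boolean tensor; a float32 condition is
        # assumed to be rejected while the graph is being built.
        cond = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        result = layers.create_global_var(shape=[1],
                                          value=-1.0,
                                          dtype='float32',
                                          persistable=True)
        try:
            with layers.Switch() as switch:
                with switch.case(cond):
                    layers.assign(zero_var, result)
        except TypeError:
            return True
    return False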
Example No. 7
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
                        num_bad_steps, incr_every_n_steps,
                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
    """
    Update loss scaling according to overall gradients. If all gradients is
    finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
    Otherwise, loss scaling will decrease by decr_ratio after
    decr_every_n_nan_or_inf steps and each step some gradients are infinite.

    Args:
        is_overall_finite (Variable): A boolean variable indicates whether
                                     all gradients are finite.
        prev_loss_scaling (Variable): Previous loss scaling.
        num_good_steps (Variable): A variable accumulates good steps in which
                                   all gradients are finite.
        num_bad_steps (Variable): A variable accumulates bad steps in which
                                  some gradients are infinite.
        incr_every_n_steps (Variable): A variable represents increasing loss
                                       scaling every n consecutive steps with
                                       finite gradients.
        decr_every_n_nan_or_inf (Variable): A variable represents decreasing
                                            loss scaling every n accumulated
                                            steps with nan or inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing
                           loss scaling.
    """
    zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0)
    with layers.Switch() as switch:
        with switch.case(is_overall_finite):
            should_incr_loss_scaling = layers.less_than(
                incr_every_n_steps, num_good_steps + 1)
            with layers.Switch() as switch1:
                with switch1.case(should_incr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * incr_ratio
                    loss_scaling_is_finite = layers.isfinite(new_loss_scaling)
                    with layers.Switch() as switch2:
                        with switch2.case(loss_scaling_is_finite):
                            layers.assign(new_loss_scaling, prev_loss_scaling)
                        with switch2.default():
                            pass
                    layers.assign(zero_steps, num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)

                with switch1.default():
                    layers.increment(num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)

        with switch.default():
            should_decr_loss_scaling = layers.less_than(
                decr_every_n_nan_or_inf, num_bad_steps + 1)
            with layers.Switch() as switch3:
                with switch3.case(should_decr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * decr_ratio
                    static_loss_scaling = \
                        layers.fill_constant(shape=[1],
                                             dtype='float32',
                                             value=1.0)
                    less_than_one = layers.less_than(new_loss_scaling,
                                                     static_loss_scaling)
                    with layers.Switch() as switch4:
                        with switch4.case(less_than_one):
                            layers.assign(static_loss_scaling,
                                          prev_loss_scaling)
                        with switch4.default():
                            layers.assign(new_loss_scaling, prev_loss_scaling)
                    layers.assign(zero_steps, num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)
                with switch3.default():
                    layers.assign(zero_steps, num_good_steps)
                    layers.increment(num_bad_steps)
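A hedged sketch of how the state passed to update_loss_scaling is typically set up (persistable counters plus constant thresholds; the concrete values and the dummy gradient below are illustrative, only the parameter meanings come from the docstring above):

import paddle.fluid as fluid
import paddle.fluid.layers as layers

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Persistable state that survives across iterations.
    loss_scaling = layers.create_global_var(shape=[1],
                                            value=2.**15,
                                            dtype='float32',
                                            persistable=True)
    num_good_steps = layers.create_global_var(shape=[1],
                                              value=0,
                                              dtype='int32',
                                              persistable=True)
    num_bad_steps = layers.create_global_var(shape=[1],
                                             value=0,
                                             dtype='int32',
                                             persistable=True)
    # Thresholds controlling when the scaling is adjusted.
    incr_every_n_steps = layers.fill_constant(shape=[1], dtype='int32',
                                              value=1000)
    decr_every_n_nan_or_inf = layers.fill_constant(shape=[1], dtype='int32',
                                                   value=2)
    # In practice is_overall_finite is computed from the gradients, e.g. with
    # layers.isfinite over their sum as in Example No. 9 below; a dummy
    # tensor stands in here.
    dummy_grad_sum = layers.fill_constant(shape=[1], dtype='float32',
                                          value=0.0)
    is_overall_finite = layers.isfinite(dummy_grad_sum)

    update_loss_scaling(is_overall_finite, loss_scaling,
                        num_good_steps, num_bad_steps,
                        incr_every_n_steps, decr_every_n_nan_or_inf,
                        incr_ratio=2.0, decr_ratio=0.5)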
Example No. 8
def decode(context):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(name="init_scores",
                          shape=[1],
                          dtype="float32",
                          lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(input=pre_ids,
                                   size=[dict_size, word_dim],
                                   dtype='float32',
                                   is_sparse=is_sparse,
                                   param_attr=fluid.ParamAttr(name='vemb'))

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(x=pd.log(topk_scores),
                                         y=pd.reshape(pre_score, shape=[-1]),
                                         axis=0)
        selected_ids, selected_scores = pd.beam_search(pre_ids,
                                                       pre_score,
                                                       topk_indices,
                                                       accu_scores,
                                                       beam_size,
                                                       end_id=10,
                                                       level=0)

        with pd.Switch() as switch:
            with switch.case(pd.is_empty(selected_ids)):
                pd.fill_constant(shape=[1],
                                 value=0,
                                 dtype='bool',
                                 force_cpu=True,
                                 out=cond)
            with switch.default():
                pd.increment(x=counter, value=1, in_place=True)

                # update the memories
                pd.array_write(current_state, array=state_array, i=counter)
                pd.array_write(selected_ids, array=ids_array, i=counter)
                pd.array_write(selected_scores, array=scores_array, i=counter)

                # update the break condition: up to the max length or all candidates of
                # source sentences have ended.
                length_cond = pd.less_than(x=counter, y=array_len)
                finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
                pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    return translation_ids, translation_scores
Example No. 9
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        assert loss._get_info('shard_logit')

        shard_logit = loss._get_info('shard_logit')
        shard_prob = loss._get_info('shard_prob')
        shard_label = loss._get_info('shard_label')
        shard_dim = loss._get_info('shard_dim')

        op_maker = fluid.core.op_proto_and_checker_maker
        op_role_key = op_maker.kOpRoleAttrName()
        op_role_var_key = op_maker.kOpRoleVarAttrName()
        backward_role = int(op_maker.OpRole.Backward)
        loss_backward_role = int(op_maker.OpRole.Loss) | int(
            op_maker.OpRole.Backward)

        # minimize a scalar of reduce_sum to generate the backward network
        scalar = fluid.layers.reduce_sum(shard_logit)
        block = loss.block

        if not self._use_fp16:
            ret = self._optimizer.minimize(scalar)

            # remove the unnecessary ops
            index = 0
            for i, op in enumerate(block.ops):
                if op.all_attrs()[op_role_key] == loss_backward_role:
                    index = i
                    break

            assert block.ops[index - 1].type == 'reduce_sum'
            assert block.ops[index].type == 'fill_constant'
            assert block.ops[index + 1].type == 'reduce_sum_grad'
            block._remove_op(index + 1)
            block._remove_op(index)
            block._remove_op(index - 1)

            self.insert_commom_backward_op(block, index, shard_logit,
                                           shard_prob, shard_label, shard_dim,
                                           op_role_key, backward_role,
                                           loss_backward_role)
            return ret
        else:
            scaled_params_grads = self.fp16_backward(block, scalar,
                                                     startup_program,
                                                     parameter_list,
                                                     no_grad_set, callbacks)
            index = 0
            for i, op in enumerate(block.ops):
                if op.all_attrs()[op_role_key] == loss_backward_role:
                    index = i
                    break

            if self._loss_type == 'dist_arcface':
                assert block.ops[index - 2].type == 'fill_constant'
                assert block.ops[index - 1].type == 'reduce_sum'
                assert block.ops[index].type == 'fill_constant'
                assert block.ops[index + 1].type == 'reduce_sum_grad'
                assert block.ops[index + 2].type == 'scale'
                assert block.ops[index + 3].type == 'elementwise_add_grad'

                block._remove_op(index + 2)
                block._remove_op(index + 1)
                block._remove_op(index)
                block._remove_op(index - 1)

                self.insert_dist_arcface_backward_op(block, index, shard_logit,
                                                     shard_prob, shard_label,
                                                     shard_dim, op_role_key,
                                                     backward_role,
                                                     loss_backward_role)

            elif self._loss_type == 'dist_softmax':
                assert block.ops[index - 1].type == 'reduce_sum'
                assert block.ops[index].type == 'fill_constant'
                assert block.ops[index + 1].type == 'reduce_sum_grad'
                assert block.ops[index + 2].type == 'cast'
                assert block.ops[index + 3].type == 'elementwise_add_grad'

                block._remove_op(index + 1)
                block._remove_op(index)
                block._remove_op(index - 1)

                self.insert_dist_softmax_backward_op(block, index, shard_logit,
                                                     shard_prob, shard_label,
                                                     shard_dim, op_role_key,
                                                     backward_role,
                                                     loss_backward_role)

            if self._use_dynamic_loss_scaling:
                grads = [
                    layers.reduce_sum(g) for [_, g] in scaled_params_grads
                ]
                all_grads = layers.concat(grads)
                all_grads_sum = layers.reduce_sum(all_grads)
                is_overall_finite = layers.isfinite(all_grads_sum)

                update_loss_scaling(is_overall_finite, self._loss_scaling,
                                    self._num_good_steps, self._num_bad_steps,
                                    self._incr_every_n_steps,
                                    self._decr_every_n_nan_or_inf,
                                    self._incr_ratio, self._decr_ratio)

                with layers.Switch() as switch:
                    with switch.case(is_overall_finite):
                        pass
                    with switch.default():
                        for _, g in scaled_params_grads:
                            layers.assign(layers.zeros_like(g), g)

            optimize_ops = self._optimizer.apply_gradients(scaled_params_grads)
            ret = optimize_ops, scaled_params_grads
            return ret