Пример #1
0
def _test_read_write(x):
    """Round-trip three tensors through a tensor array and reduce both sides.

    ``x[0]``, ``x[1]`` and ``x[2]`` are written to consecutive positions of
    an array, read back using a second counter, and the means of the values
    read back are summed.  The same mean-then-sum reduction is also applied
    directly to the three inputs.

    Returns:
        tuple: ``(a_sum, x_sum)`` -- the reduction over the values read from
        the array and the reduction over the original inputs.
    """
    # Write phase: the first write is made without an existing array, the
    # following writes extend the array it returned.
    write_idx = layers.zeros(shape=[1], dtype='int64')
    write_idx.stop_gradient = False
    arr = layers.array_write(x=x[0], i=write_idx)
    for slot in (1, 2):
        write_idx = layers.increment(x=write_idx)
        arr = layers.array_write(x=x[slot], i=write_idx, array=arr)

    # Read phase: a fresh counter walks the same three positions.
    read_idx = layers.zeros(shape=[1], dtype='int64')
    read_idx.stop_gradient = False
    read_back = [layers.array_read(array=arr, i=read_idx)]
    for _ in range(2):
        read_idx = layers.increment(x=read_idx)
        read_back.append(layers.array_read(array=arr, i=read_idx))

    a_sum = layers.sums(input=[layers.mean(item) for item in read_back])
    x_sum = layers.sums(input=[layers.mean(x[slot]) for slot in range(3)])

    return a_sum, x_sum
Пример #2
0
    def test_mul(self):
        """Store ``a``, ``b`` and ``mul(a, b)`` in a tensor array and fetch it.

        Checks the fetched product against ``numpy.dot`` and checks that the
        three array slots hold the two fed inputs and the product.
        """
        idx = zeros(shape=[1], dtype='int64')
        lhs = data(name='a', shape=[784], dtype='float32')
        tensor_array = array_write(x=lhs, i=idx)

        idx = increment(idx)
        rhs = data(
            name='b', shape=[784, 100], dtype='float32',
            append_batch_size=False)
        array_write(x=rhs, i=idx, array=tensor_array)

        idx = increment(idx)
        product = mul(x=lhs, y=rhs)
        array_write(x=product, i=idx, array=tensor_array)

        a_np = numpy.random.random((100, 784)).astype('float32')
        b_np = numpy.random.random((784, 100)).astype('float32')

        feed = {'a': a_np, 'b': b_np}
        res, res_array = Executor().run(
            feed=feed, fetch_list=[product, tensor_array])

        self.assertEqual((100, 100), res.shape)
        self.assertTrue(numpy.allclose(res, numpy.dot(a_np, b_np)))
        # Slots were written in the order: a, b, a * b.
        for slot, expected in enumerate((a_np, b_np, res)):
            self.assertTrue(numpy.allclose(res_array[slot], expected))
Пример #3
0
    def simple_net(self):
        """Build a graph with two nested ``While`` loops over tensor arrays.

        Three data inputs (``d0``, ``d1``, ``d2``) are written into
        ``data_array``; ``mem_array`` starts with a zero vector.  The outer
        loop is driven by counter ``i`` and bound ``array_len`` (5), the inner
        loop by counter ``j`` and bound ``array_len2`` (3).  Each loop body
        adds the ``data_array`` and ``mem_array`` entries at its counter and
        writes the sum into ``mem_array`` at the incremented counter.

        Returns:
            tuple: ``(loss, sum_result)`` where ``sum_result`` is the
            ``mem_array`` entry read at ``j`` after the loops and ``loss`` is
            its mean.
        """
        d0 = layers.data(
            "d0", shape=[10], append_batch_size=False, dtype='float32')
        d1 = layers.data(
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
        # fill_constant npu op doesn't support int64
        # (hence every int64 value below is created as int32 and then cast)
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        # Both arrays get their first entry at the initial value of i.
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)
        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)
        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)
        # Fresh counter for the outer loop.
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        # Outer loop bound.
        array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
        array_len = layers.cast(array_len, 'int64')
        array_len.stop_gradient = True
        # Outer loop condition starts as a bool cast of ones.
        cond = layers.ones(shape=[1], dtype='int32')
        cond = layers.cast(cond, 'bool')
        # Inner loop counter starts at 1.
        j = layers.fill_constant(shape=[1], dtype='int32', value=1)
        j = layers.cast(j, 'int64')
        j.stop_gradient = True
        # Inner loop bound.
        array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
        array_len2 = layers.cast(array_len2, 'int64')
        array_len2.stop_gradient = True
        # NOTE(review): the logical_or result is rebound on the next line and
        # never read; presumably the call still adds an op to the program --
        # confirm before removing it.
        cond2 = layers.logical_or(x=j, y=array_len2)
        cond2 = layers.ones(shape=[1], dtype='int32')
        cond2 = layers.cast(cond2, 'bool')
        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            # The sum is written at the already-incremented counter.
            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            # Refresh the outer loop condition.
            layers.less_than(x=i, y=array_len, cond=cond)

            # The inner loop is built inside the outer loop's block.
            with while_op2.block():
                # Rebinds the Python name d2 (previously the "d2" data layer).
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                # Refresh the inner loop condition.
                layers.less_than(x=j, y=array_len2, cond=cond2)
        sum_result = layers.array_read(array=mem_array, i=j)
        loss = layers.mean(sum_result)
        return loss, sum_result
Пример #4
0
def _get_gm_cond_var(main_program, k_steps):
    """Append the gradient-merge step counter ops and return the condition var.

    Creates four single-element global variables forced onto the CPU (the
    constant ``k_steps``, a constant zero, the running step counter and the
    boolean condition), then appends, under a CPU device guard, the ops that
    compute ``step = (step + 1) % k_steps`` and ``cond = (step == 0)``.

    Args:
        main_program: program whose global block receives the appended ops.
        k_steps: merge period; converted with ``int()``.

    Returns:
        The boolean ``gradient_merge_cond`` variable.
    """
    block = main_program.global_block()

    def _cpu_scalar(name, value, dtype, persistable=True):
        # All helper variables share shape [1] and force_cpu=True.
        return layers.create_global_var(name=name,
                                        shape=[1],
                                        value=value,
                                        dtype=dtype,
                                        persistable=persistable,
                                        force_cpu=True)

    # Constants.
    k_step_var = _cpu_scalar("gradient_merge_k", int(k_steps), 'int32')
    zero_var = _cpu_scalar("gradient_merge_zero", 0, 'int32')

    # Running step counter and the (non-persistable) condition flag.
    step_var = _cpu_scalar("gradient_merge_step", 0, 'int32')
    cond_var = _cpu_scalar("gradient_merge_cond", False, 'bool',
                           persistable=False)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        layers.increment(x=step_var, value=1.0, in_place=True)
        mod_inputs = {'X': step_var, 'Y': k_step_var}
        mod_attrs = {'axis': -1, 'use_mkldnn': False}
        block.append_op(type='elementwise_mod',
                        inputs=mod_inputs,
                        outputs={'Out': step_var},
                        attrs=mod_attrs)

        # cond_var = (step_var == 0)
        equal_inputs = {'X': step_var, 'Y': zero_var}
        block.append_op(type='equal',
                        inputs=equal_inputs,
                        outputs={'Out': cond_var})

    return cond_var
Пример #5
0
    def test_simple_forward(self):
        """Accumulate three fed vectors with a ``While`` loop and check the sum.

        ``d0``..``d2`` are written into ``data_array`` and ``mem_array`` is
        seeded with zeros.  Each loop iteration adds the ``data_array`` and
        ``mem_array`` entries at ``i`` and stores the result in ``mem_array``
        at ``i + 1``.  After the loop the entry at the final ``i`` is fetched
        and its total is compared with the total of the fed data.
        """
        d0 = layers.data(
            "d0", shape=[10], append_batch_size=False, dtype='float32')
        d1 = layers.data(
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        # Fresh counter for the loop itself.
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        while_op = layers.While(cond=cond)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            # The running sum is stored at the already-incremented counter.
            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            # Refresh the loop condition in place.
            layers.less_than(x=i, y=array_len, cond=cond)

        sum_result = layers.array_read(array=mem_array, i=i)
        loss = layers.mean(sum_result)

        append_backward(loss)

        cpu = core.CPUPlace()
        exe = Executor(cpu)

        # ``range`` exists on both Python 2 and 3 (``xrange`` is Python 2
        # only); a throwaway loop name keeps the graph variable ``i`` intact.
        feed_values = [
            numpy.random.random(size=[10]).astype('float32')
            for _ in range(3)
        ]

        outs = exe.run(feed={'d0': feed_values[0],
                             'd1': feed_values[1],
                             'd2': feed_values[2]},
                       fetch_list=[sum_result])
        self.assertAlmostEqual(
            numpy.sum(feed_values), numpy.sum(outs[0]), delta=0.01)
Пример #6
0
 def increment(self):
     """Grow the scale once enough good steps have accumulated.

     When ``increment_every`` is less than ``good_steps + 1`` the candidate
     ``scale * factor`` is computed; if ``layers.isfinite`` accepts it, it
     is assigned to ``scale`` and ``good_steps`` is reset to zeros,
     otherwise ``good_steps`` is incremented.  When the threshold is not
     reached, ``good_steps`` is simply incremented.
     """
     next_good_steps = self.good_steps + 1
     enough_steps = layers.less_than(self.increment_every, next_good_steps)
     with layers.Switch() as outer:
         with outer.case(enough_steps):
             candidate = self.scale * self.factor
             candidate_ok = layers.isfinite(candidate)
             with layers.Switch() as inner:
                 with inner.case(candidate_ok):
                     layers.assign(candidate, self.scale)
                     cleared = layers.zeros_like(self.good_steps)
                     layers.assign(cleared, self.good_steps)
                 with inner.default():
                     layers.increment(self.good_steps)
         with outer.default():
             layers.increment(self.good_steps)
Пример #7
0
 def internal_body(j, x, mem_array):
     """Inner loop step: add ``x`` to the array entries at ``j``.

     Reads ``data_array`` (from the enclosing scope) and ``mem_array`` at
     ``j``, adds them together with ``x``, increments ``j`` in place and
     writes the total into ``mem_array`` at the new ``j``.

     Returns:
         list: ``[j, x, mem_array]`` for the next iteration.
     """
     slot_data = layers.array_read(array=data_array, i=j)
     slot_prev = layers.array_read(array=mem_array, i=j)
     total = layers.elementwise_add(
         x=x, y=layers.elementwise_add(x=slot_data, y=slot_prev))
     j = layers.increment(x=j, in_place=True)
     layers.array_write(total, i=j, array=mem_array)
     return [j, x, mem_array]
    def build_program(self, main_program, startup_program):
        """Build an SGD-trained FC net and pack image, label and loss.

        Everything is created under a unique-name guard and a program guard
        for the given programs.  The image, label and loss variables are
        written, in that order, into consecutive positions of one array.

        Returns:
            tuple: ``(loss, array)``.
        """
        with fluid.unique_name.guard(), \
                fluid.program_guard(main_program, startup_program):
            idx = layers.zeros(shape=[1], dtype='int64')
            img = fluid.data(name='image', shape=[-1, 784], dtype='float32')
            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
            # This network's loss is not used: the loss optimized and
            # returned below comes from simple_fc_net().
            simple_fc_net_with_inputs(img, label, class_num=10)
            loss = simple_fc_net()
            fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

            packed = layers.array_write(x=img, i=idx)
            for item in (label, loss):
                idx = layers.increment(idx)
                layers.array_write(x=item, i=idx, array=packed)

            return loss, packed
Пример #9
0
 def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
               trg_src_attn_bias):
     """Run one beam-search decoding step.

     Gathers the caches and the attention bias by ``gather_idx``, runs the
     decoder on the previous ids, keeps the per-beam top-k candidates, adds
     their log-probabilities to ``pre_scores``, and lets
     ``layers.beam_search`` pick the surviving candidates.  The selected ids
     and scores are written at the incremented step into the ``ids`` and
     ``scores`` arrays taken from the enclosing scope.

     Returns:
         tuple: ``(step_idx, selected_ids, selected_scores, gather_idx,
         pre_caches, pre_src_attn_bias)`` -- the values for the next step.
     """
     # gather cell states corresponding to selected parent
     pre_caches = map_structure(
         lambda x: layers.gather(x, index=gather_idx), caches)
     pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                       index=gather_idx)
     # Position input: a [-1, 1] tensor of ones multiplied by the current
     # step index.
     pre_pos = layers.elementwise_mul(
         x=layers.fill_constant_batch_size_like(
             input=pre_src_attn_bias,  # can't use lod tensor here
             value=1,
             shape=[-1, 1],
             dtype=pre_ids.dtype),
         y=step_idx,
         axis=0)
     # All hyper-parameters below come from the enclosing scope.
     logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                           trg_vocab_size,
                           max_in_len,
                           n_layer,
                           n_head,
                           d_key,
                           d_value,
                           d_model,
                           d_inner_hid,
                           prepostprocess_dropout,
                           attention_dropout,
                           relu_dropout,
                           preprocess_cmd,
                           postprocess_cmd,
                           weight_sharing,
                           enc_output=enc_output,
                           caches=pre_caches,
                           bos_idx=bos_idx)
     # intra-beam topK
     topk_scores, topk_indices = layers.topk(
         input=layers.softmax(logits), k=beam_size)
     # Accumulate in log space: log(prob) + previous score.
     accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                          y=pre_scores,
                                          axis=0)
     # beam_search op uses lod to differentiate branches.
     accu_scores = layers.lod_reset(accu_scores, pre_ids)
     # topK reduction across beams, also contain special handle of
     # end beams and end sentences(batch reduction)
     selected_ids, selected_scores, gather_idx = layers.beam_search(
         pre_ids=pre_ids,
         pre_scores=pre_scores,
         ids=topk_indices,
         scores=accu_scores,
         beam_size=beam_size,
         end_id=eos_idx,
         return_parent_idx=True)
     # Not in place: step_idx is rebound to the result of the increment.
     step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
     layers.array_write(selected_ids, i=step_idx, array=ids)
     layers.array_write(selected_scores, i=step_idx, array=scores)
     return (step_idx, selected_ids, selected_scores, gather_idx,
             pre_caches, pre_src_attn_bias)
Пример #10
0
 def setUp(self):
     """Build a three-stage ``shrink_memory`` chain and its backward pass.

     ``mem1``, ``mem2`` and ``mem3`` are produced by applying
     ``layers.shrink_memory`` at three successive index values, each stage
     consuming the previous one.  Backward ops are appended for the mean
     of ``mem3`` and the ``x@GRAD`` variable is kept on ``self.x_grad``.
     """
     self.main_program = Program()
     switch_main_program(self.main_program)

     def _advance(idx):
         # Step the index and exclude it from gradient computation.
         idx = layers.increment(x=idx)
         idx.stop_gradient = True
         return idx

     x = layers.data('x', shape=[100], dtype='float32')
     x.stop_gradient = False
     rank_source = layers.data(
         'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
     table = layers.lod_rank_table(x=rank_source)

     step = layers.zeros(dtype='int64', shape=[1])
     self.mem1 = layers.shrink_memory(x=x, i=step, table=table)
     step = _advance(step)
     self.mem2 = layers.shrink_memory(x=self.mem1, i=step, table=table)
     step = _advance(step)
     self.mem3 = layers.shrink_memory(x=self.mem2, i=step, table=table)

     append_backward(loss=layers.mean(self.mem3))
     self.x_grad = self.main_program.global_block().var('x@GRAD')
Пример #11
0
 def setUp(self):
     """Build a three-stage ``shrink_memory`` chain and its backward pass.

     ``mem1``, ``mem2`` and ``mem3`` are produced by applying
     ``layers.shrink_memory`` at three successive index values, each stage
     consuming the previous one.  Backward ops are appended for the mean
     of ``mem3`` and the ``x@GRAD`` variable is kept on ``self.x_grad``.
     """
     self.main_program = Program()
     switch_main_program(self.main_program)

     def _advance(idx):
         # Step the index and exclude it from gradient computation.
         idx = layers.increment(x=idx)
         idx.stop_gradient = True
         return idx

     x = layers.data('x', shape=[100], dtype='float32')
     x.stop_gradient = False
     rank_source = layers.data(
         'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
     table = layers.lod_rank_table(x=rank_source)

     step = layers.zeros(dtype='int64', shape=[1])
     self.mem1 = layers.shrink_memory(x=x, i=step, table=table)
     step = _advance(step)
     self.mem2 = layers.shrink_memory(x=self.mem1, i=step, table=table)
     step = _advance(step)
     self.mem3 = layers.shrink_memory(x=self.mem2, i=step, table=table)

     append_backward(loss=layers.mean(self.mem3))
     self.x_grad = self.main_program.global_block().var('x@GRAD')
Пример #12
0
 def body(j, x):
     """Loop step: square a copy of ``j`` and advance ``j``.

     The incoming ``x`` is not read; it is replaced by the squared copy.

     Returns:
         list: ``[j, x]`` with the incremented counter and the new square.
     """
     # TODO: inside a while block, a var created in the parent block that
     # takes part in the gradient computation yields an incorrect gradient,
     # because every step scope returns the value generated by the last
     # step.  The `assign` op below is the workaround until that is fixed
     # in a following PR.
     j_copy = layers.assign(j)
     squared = layers.elementwise_mul(x=j_copy, y=j_copy)
     j = layers.increment(j)
     return [j, squared]
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
    """Build a ``While`` loop around ``gather_nd``, run it once, return the loss.

    A random ``(batch_size, beam_size, 32)`` tensor is gathered with fed
    ``indices`` on every loop iteration (up to ``max_len`` = 10) and each
    gathered score is appended to a tensor array.  The stacked array is
    reduced to a mean loss, minimized with Adam, and executed once.

    Args:
        place: device place handed to ``fluid.Executor``.
        batch_size (int): first dimension of the random input and of the fed
            index matrix.
        beam_size (int): second dimension; indices are drawn from
            ``[0, beam_size - 1]``.
        stop_gradient (bool): value assigned to ``stop_gradient`` on the
            batch-size tensor and on the gather coordinates.

    Returns:
        The fetched loss value.
    """
    # Fixed seeds keep both the program and the numpy data reproducible.
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(2)

    x = layers.assign(
        np.random.rand(batch_size, beam_size, 32).astype("float32"))
    indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=10, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    scores = layers.array_write(x, step_idx)
    with while_op.block():
        bs = layers.cast(layers.shape(x)[0], "int64")
        # NOTE(review): 20 repeated int64 casts; presumably intentional to
        # add extra ops to the loop body -- confirm before simplifying.
        for _ in range(20):
            bs = layers.cast(bs, 'int64')
        bs.stop_gradient = stop_gradient
        # Row index of every (batch, beam) cell, expanded across beams.
        batch_pos = layers.expand(
            layers.unsqueeze(
                layers.range(
                    0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size])
        topk_coordinates = layers.stack([batch_pos, indices], axis=2)
        topk_coordinates.stop_gradient = stop_gradient
        score = layers.gather_nd(x, topk_coordinates)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.array_write(score, i=step_idx, array=scores)
        # Refresh the loop condition.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        layers.assign(length_cond, cond)

    out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
    loss = layers.reduce_mean(out)
    opt = fluid.optimizer.Adam(0.01)
    opt.minimize(loss)
    exe = fluid.Executor(place)
    # randint's upper bound is exclusive, so high=beam_size covers the same
    # inclusive range [0, beam_size - 1] as the deprecated
    # random_integers(low=0, high=beam_size - 1) call it replaces.
    data = np.random.randint(
        low=0, high=beam_size, size=(batch_size, beam_size)).astype("int64")
    loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])

    return loss_val
Пример #14
0
        def body(i, ten, test_dict, test_list, test_list_dict):
            """Loop step that updates dict, list and list-of-dict loop variables.

            ``ten`` is passed through unchanged; the containers are modified
            in place and returned together with the incremented ``i``.

            Returns:
                list: ``[i, ten, test_dict, test_list, test_list_dict]``.
            """
            # Store the counter under "test_key", then add one to the stored
            # value.
            test_dict["test_key"] = i
            test_dict["test_key"] += 1

            # Reshape the first list element to [2, -1] and add one.
            test_list[0] = fluid.layers.reshape(test_list[0], [2, -1]) + 1

            # Add one to the nested entry, then pass it through relu.
            test_list_dict[0]["test_key"] += 1
            test_list_dict[0]["test_key"] = fluid.layers.relu(
                test_list_dict[0]["test_key"])

            i = layers.increment(i)
            return [i, ten, test_dict, test_list, test_list_dict]
Пример #15
0
        def external_body(i, j, init, sums):
            """Outer loop step: run the inner loop, then fold ``init`` into ``sums``.

            The inner ``while_loop`` runs while ``j`` is less than
            ``loop_len2`` (from the enclosing scope).  Each inner step adds
            ``ones`` (also from the enclosing scope) to ``init``, adds the
            new ``init`` to ``sums`` and increments ``j``.

            Returns:
                list: ``[i, j, init, sums]`` with ``i`` incremented.
            """

            def _keep_going(inner_j, inner_init, inner_sums):
                return layers.less_than(inner_j, loop_len2)

            def _step(inner_j, inner_init, inner_sums):
                inner_init = layers.elementwise_add(x=inner_init, y=ones)
                inner_sums = layers.elementwise_add(x=inner_init, y=inner_sums)
                inner_j = layers.increment(inner_j)
                return [inner_j, inner_init, inner_sums]

            loop_out = layers.while_loop(_keep_going, _step, [j, init, sums])
            j, init, sums = loop_out[0], loop_out[1], loop_out[2]
            sums = layers.elementwise_add(x=init, y=sums)
            i = layers.increment(i)
            return [i, j, init, sums]
Пример #16
0
        def external_body(i, j, x, mem_array):
            """Outer loop step: accumulate at ``i``, then run the inner loop on ``j``.

            Adds the ``data_array`` (enclosing scope) and ``mem_array``
            entries at ``i`` together with ``x``, increments ``i`` in place
            and writes the total into ``mem_array`` at the new ``i``.  The
            inner ``while_loop`` then performs the same accumulation with
            ``j`` while ``j`` is less than ``array_len2`` (enclosing scope).

            Returns:
                list: ``[i, j, x, mem_array]``.
            """
            slot_data = layers.array_read(array=data_array, i=i)
            slot_prev = layers.array_read(array=mem_array, i=i)
            outer_total = layers.elementwise_add(
                x=x, y=layers.elementwise_add(x=slot_data, y=slot_prev))
            i = layers.increment(x=i, in_place=True)
            layers.array_write(outer_total, i=i, array=mem_array)

            def _inner_cond(j, x, mem_array):
                return layers.less_than(j, array_len2)

            def _inner_step(j, x, mem_array):
                cur_data = layers.array_read(array=data_array, i=j)
                cur_prev = layers.array_read(array=mem_array, i=j)
                inner_total = layers.elementwise_add(
                    x=x, y=layers.elementwise_add(x=cur_data, y=cur_prev))
                j = layers.increment(x=j, in_place=True)
                layers.array_write(inner_total, i=j, array=mem_array)
                return [j, x, mem_array]

            j, x, mem_array = layers.while_loop(_inner_cond, _inner_step,
                                                [j, x, mem_array])
            return [i, j, x, mem_array]
Пример #17
0
    def increment(cls, x, value, in_place=False):
        """Increment every element of ``x`` by ``value``.

        A 1-D variable holding a single element is delegated to
        ``layers.increment``.  Any other shape is handled by adding a
        one-element constant of ``x``'s dtype with ``layers.elementwise_add``.

        Args:
            x (Variable): variable to increment.
            value (int/float): amount added to each element.
            in_place (bool): when True the sum is assigned back onto ``x``
                and ``x`` is returned. Default is False.

        Returns:
            Variable: for the single-element case, whatever
            ``layers.increment`` returns; otherwise ``x`` when ``in_place``
            is set, else the variable holding the sum.
        """
        is_single_element = len(x.shape) == 1 and x.shape[0] == 1
        if is_single_element:
            return layers.increment(x, value, in_place)

        delta = layers.fill_constant(shape=[1], dtype=x.dtype, value=value)
        summed = layers.elementwise_add(x, delta)
        if not in_place:
            return summed

        # Copy the sum back onto the input and hand the input back.
        layers.assign(summed, x)
        return x
Пример #18
0
    def beam_search():
        """Build the beam-search decoding loop and decode the final result.

        All inputs (``start_tokens``, ``init_scores``, ``enc_output``,
        ``trg_src_attn_bias``, the shape tensors and the model
        hyper-parameters) come from the enclosing scope.  Selected ids and
        scores of every step are stored in tensor arrays; the loop stops when
        ``max_out_len`` steps have been taken or no candidate is selected.

        Returns:
            tuple: ``(finished_ids, finished_scores)`` as produced by
            ``layers.beam_search_decode``.
        """
        max_len = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(start_tokens, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps to reduce redundant
        # computation in decoder.
        caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod thus can be
            # used in beam search to sift states corresponding to selected ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = [{
                "k": layers.sequence_expand(
                    x=cache["k"], y=pre_scores),
                "v": layers.sequence_expand(
                    x=cache["v"], y=pre_scores),
            } for cache in caches]
            # Position input: a [-1, 1] tensor of ones multiplied by
            # (step_idx + 1); the increment here is not in place.
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
                    value=1,
                    shape=[-1, 1],
                    dtype=pre_ids.dtype),
                y=layers.increment(
                    x=step_idx, value=1.0, in_place=False),
                axis=0)
            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(
                    pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                    slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                    src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
                enc_output=pre_enc_output,
                caches=pre_caches)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            # Accumulate in log space: log(prob) + flattened previous score.
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
            # beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)
            # Advance the shared step counter in place before storing results.
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # update states
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for i in range(n_layer):
                layers.assign(pre_caches[i]["k"], caches[i]["k"])
                layers.assign(pre_caches[i]["v"], caches[i]["v"])
            # Add the per-step deltas to the self-attention softmax shape
            # tensors and store the result back in place.
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_pre_softmax_shape,
                    y=attn_pre_softmax_shape_delta),
                slf_attn_pre_softmax_shape)
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_post_softmax_shape,
                    y=attn_post_softmax_shape_delta),
                slf_attn_post_softmax_shape)

            # Continue only while under max_len AND something was selected.
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
Пример #19
0
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            inputs(dict): Its key is input name(str) and its value is a Variable.
            model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`.

        Returns:
            dict(str:Variable): Its key is output name(str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
        min_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
        step_idx = layers.fill_constant(
            shape=[1], dtype="int64", value=0, force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                    input=pre_ids,
                    value=1.0,
                    shape=[-1, 1, 1],
                    dtype=dtype)
            tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0)

            if self.use_role:
                pre_role = layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=0,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype)
            else:
                pre_role = None

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                role_ids=pre_role,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)
            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)
            def no_penalty():
                """No penalty."""
                return logits
            logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(
                    input=probs, k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs,
                            layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
                )
                sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(
                    input=sampling_scores, k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
            elif self.length_penalty > 0:
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores, axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
Пример #20
0
    def test_simple_forward(self):
        """Run two nested While loops over tensor arrays and check the result.

        Three length-10 inputs are written into ``data_array``. The outer and
        inner loops each read one entry from ``data_array`` and ``mem_array``,
        sum them and write the sum back into ``mem_array``. The entry finally
        read from ``mem_array`` is compared with the sum of all fed values.
        """
        names = ("d0", "d1", "d2")

        def _int64_scalar(value):
            # One-element int64 constant that takes no part in back-prop.
            const = layers.fill_constant(shape=[1], dtype='int64', value=value)
            const.stop_gradient = True
            return const

        inputs = [
            layers.data(name,
                        shape=[10],
                        append_batch_size=False,
                        dtype='float32') for name in names
        ]

        # Fill data_array with the three inputs at indices 0, 1 and 2;
        # mem_array starts with a zero vector at index 0.
        write_idx = layers.zeros(shape=[1], dtype='int64')
        write_idx.stop_gradient = True

        zero_mem = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=zero_mem, i=write_idx)
        data_array = layers.array_write(x=inputs[0], i=write_idx)

        for later_input in inputs[1:]:
            write_idx = layers.increment(write_idx)
            layers.array_write(later_input, write_idx, array=data_array)

        # Outer loop counter starts at 0 and is bounded by 1.
        outer_idx = layers.zeros(shape=[1], dtype='int64')
        outer_idx.stop_gradient = True

        outer_limit = _int64_scalar(1)
        outer_cond = layers.less_than(x=outer_idx, y=outer_limit)

        # Inner loop counter starts at 1 and is bounded by 3.
        inner_idx = _int64_scalar(1)
        inner_limit = _int64_scalar(3)
        inner_cond = layers.less_than(x=inner_idx, y=inner_limit)

        outer_loop = layers.While(cond=outer_cond)
        inner_loop = layers.While(cond=inner_cond)
        with outer_loop.block():
            outer_data = layers.array_read(array=data_array, i=outer_idx)
            outer_mem = layers.array_read(array=mem_array, i=outer_idx)
            outer_sum = layers.sums(input=[outer_data, outer_mem])

            outer_idx = layers.increment(x=outer_idx, in_place=True)
            layers.array_write(outer_sum, i=outer_idx, array=mem_array)
            layers.less_than(x=outer_idx, y=outer_limit, cond=outer_cond)

            with inner_loop.block():
                inner_data = layers.array_read(array=data_array, i=inner_idx)
                inner_mem = layers.array_read(array=mem_array, i=inner_idx)
                inner_sum = layers.sums(input=[inner_data, inner_mem])

                inner_idx = layers.increment(x=inner_idx, in_place=True)
                layers.array_write(inner_sum, i=inner_idx, array=mem_array)
                layers.less_than(x=inner_idx, y=inner_limit, cond=inner_cond)

        sum_result = layers.array_read(array=mem_array, i=inner_idx)
        loss = layers.mean(sum_result)

        append_backward(loss)

        exe = Executor(core.CPUPlace())

        samples = [
            numpy.random.random(size=[10]).astype('float32') for _ in names
        ]

        outs = exe.run(feed=dict(zip(names, samples)),
                       fetch_list=[sum_result])
        self.assertAlmostEqual(
            numpy.sum(samples), numpy.sum(outs[0]), delta=0.01)
Пример #21
0
    def test_read_write(self):
        """Round-trip three inputs through a tensor array and check gradients.

        The three inputs are written to a tensor array and read back. The sum
        of the means of the read values must equal the sum of the means of the
        original inputs. After back-propagating through the scaled total, the
        gradients of the three inputs must sum to roughly 1.
        """
        x = [
            layers.data(name='x0', shape=[100]),
            layers.data(name='x1', shape=[100]),
            layers.data(name='x2', shape=[100])
        ]

        for each_x in x:
            each_x.stop_gradient = False

        # Write x[0], x[1], x[2] at indices 0, 1, 2.
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        arr = layers.array_write(x=x[0], i=i)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[1], i=i, array=arr)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[2], i=i, array=arr)

        # Read the three entries back with a fresh index.
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        a0 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a1 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a2 = layers.array_read(array=arr, i=i)

        mean_a0 = layers.mean(a0)
        mean_a1 = layers.mean(a1)
        mean_a2 = layers.mean(a2)

        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

        mean_x0 = layers.mean(x[0])
        mean_x1 = layers.mean(x[1])
        mean_x2 = layers.mean(x[2])

        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

        scope = core.Scope()
        cpu = core.CPUPlace()

        exe = Executor(cpu)

        tensor = numpy.random.random(size=(100, 100)).astype('float32')
        feed = {'x0': tensor, 'x1': tensor, 'x2': tensor}

        outs = exe.run(feed=feed, fetch_list=[a_sum, x_sum], scope=scope)
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
        # which is not a valid ``fetch_list``.
        global_block = default_main_program().global_block()
        g_vars = [global_block.var(each_x.name + "@GRAD") for each_x in x]
        g_out = [item.sum() for item in exe.run(feed=feed, fetch_list=g_vars)]
        g_out_sum = numpy.array(g_out).sum()

        # since our final gradient is 1 and the neural network are all linear
        # with mean_op.
        # the input gradient should also be 1
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
Пример #22
0
    def infilling_decode(self):
        """Build the step-by-step beam-search decoding graph.

        Declares the graph inputs, wraps them in a non-iterable ``DataLoader``,
        creates an ``ErnieModel`` in decoding mode and unrolls a ``While`` loop
        that runs one ``beam_search`` step per iteration.

        Returns:
            tuple: ``(pyreader, graph_vars)``. ``graph_vars`` maps
            ``"finished_ids"``, ``"finished_scores"`` and ``"data_ids"`` to
            variables that have been marked persistable.
        """
        # Dialog tasks feed four embedding-id inputs, every other task three.
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        # Encoder-side inputs: emb_num int64 id tensors plus one float32 mask.
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        # Six decoder-side inputs follow, in the order they are unpacked
        # below: tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask,
        # data_ids.
        shapes = input_shapes + [[-1, self.max_seq_len, 1],
                                 [-1, self.max_seq_len, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        # NOTE(review): `to_ternsor` is defined elsewhere; the name looks like
        # a misspelling of "to_tensor" -- confirm before renaming.
        inputs = self.to_ternsor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=50,
                                                      iterable=False)

        # Pair each embedding key with its id input.
        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        ernie = ErnieModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.ernie_config,
                           use_fp16=self.use_fp16,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        # Loop counters: step_idx starts at 0 and pos_idx at 1; both are
        # incremented once per iteration.
        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        pos_idx = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=1,
                                       force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        # Tensor arrays holding the per-step state, seeded with the step-0
        # values.
        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                        step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        with while_op.block():
            # Load the previous step's state; pos_bias and the mask are
            # re-gathered with parent_idx.
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)
            tmp_mask = layers.array_read(tgt_masks, i=step_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                """Create a tensor batch-sized like ``parent_idx`` holding ``value``.

                With ``is_scalar`` the tensor is filled with the constant
                ``value``; otherwise a tensor of ones is multiplied
                element-wise (``axis=0``) by the variable ``value``.
                """
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            # Extend the stored mask by one column of ones, then build the two
            # query rows: pre_mask appends a 0, cur_mask appends a 1.
            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
            pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
            cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            # The decoder is fed two tokens per step: the previously selected
            # id and the `attn_id` placeholder.
            cur_ids = gen_batch_like(self.attn_id)
            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            cur_pos = gen_batch_like(pos_idx, is_scalar=False)
            if self.continuous_position:
                pre_pos = pre_pos + pos_bias
                cur_pos = cur_pos + pos_bias

            dec_emb_ids = {
                "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
                "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
            }
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = layers.concat(
                    [role_ids, role_ids], axis=1)
                dec_emb_ids["turn_embedding"] = layers.concat(
                    [turn_ids, turn_ids], axis=1)
            else:
                sent_ids = gen_batch_like(self.tgt_type_id)
                dec_emb_ids["sent_embedding"] = layers.concat(
                    [sent_ids, sent_ids], axis=1)
            dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

            dec_out = ernie.encode(dec_emb_ids,
                                   dec_mask,
                                   parent_idx,
                                   remove_query=True)
            # Only positions from index 1 onward are turned into logits.
            fc_out = self.cal_logit(dec_out[:, 1:, :], None)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(fc_out), k=self.beam_size)
            # Length penalty ((5 + len) / 6) ** length_penalty: previous
            # scores are un-normalised with the old length and the new sum is
            # re-normalised with the new one.
            pre_lenpen = layers.pow(
                (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            cur_lenpen = layers.pow(
                (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores * pre_lenpen,
                                                 axis=0) / cur_lenpen
            # Give the candidates the same LoD as pre_ids before beam_search.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=self.beam_size,
                end_id=self.eos_idx,
                return_parent_idx=True)

            # Advance the counters and store this step's state.
            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=pos_idx, value=1.0, in_place=True)
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            # Keep looping while under max_len and some beam is still alive.
            layers.assign(gather_idx, parent_idx)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        # Mark every output variable persistable.
        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
Пример #23
0
    def beam_search(enc_output, enc_bias, source_length):
        """Build the While-loop beam-search decoding graph.

        The encoder output and attention bias are tiled ``beam_size`` times.
        Each iteration then runs ``wrap_decoder`` on the last token of every
        alive sequence and lets ``BeamSearch.inner_func`` update the alive and
        finished beams and the decoder caches.

        Args:
            enc_output: encoder output; expanded per beam and reshaped to
                ``[batch_size * beam_size, -1, d_model]``.
            enc_bias: encoder attention bias; expanded per beam and head.
            source_length: passed to ``BeamSearch.is_finished`` to decide
                whether decoding may stop.

        Returns:
            tuple: ``(finished_seq, finished_scores)``, both marked
            persistable. Where the finished mask is 0 for a batch entry, the
            still-alive sequences and scores are returned instead.
        """
        max_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype='int64', value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        caches_batch_size = batch_size * beam_size
        # Only the first beam starts with score 0; the rest start at -INF.
        init_score = np.zeros([1, beam_size]).astype('float32')
        init_score[:, 1:] = -INF
        initial_log_probs = layers.assign(init_score)

        alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1])
        # alive seq [batch_size, beam_size, 1]
        initial_ids = layers.zeros([batch_size, 1, 1], 'float32')
        alive_seq = layers.expand(initial_ids, [1, beam_size, 1])
        alive_seq = layers.cast(alive_seq, 'int64')

        # Tile the encoder output once per beam.
        enc_output = layers.unsqueeze(enc_output, axes=[1])
        enc_output = layers.expand(enc_output, [1, beam_size, 1, 1])
        enc_output = layers.reshape(enc_output, [caches_batch_size, -1, d_model])

        # Tile the attention bias per beam and head, then merge the first two
        # dimensions.
        tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1])
        tgt_src_attn_bias = layers.expand(tgt_src_attn_bias, [1, beam_size, n_head, 1, 1])
        enc_bias_shape = layers.shape(tgt_src_attn_bias)
        tgt_src_attn_bias = layers.reshape(tgt_src_attn_bias, [-1, enc_bias_shape[2],
                enc_bias_shape[3], enc_bias_shape[4]])

        beam_search = BeamSearch(beam_size, batch_size, decode_alpha, trg_vocab_size, d_model)

        # One empty (length-0) key/value cache per decoder layer.
        caches = [{
            "k": layers.fill_constant(
                shape=[caches_batch_size, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant(
                shape=[caches_batch_size, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for _ in range(n_layer)]

        finished_seq = layers.zeros_like(alive_seq)
        finished_scores = layers.fill_constant([batch_size, beam_size],
                                                dtype='float32', value=-INF)
        finished_flags = layers.fill_constant([batch_size, beam_size],
                                                dtype='float32', value=0)

        with while_op.block():
            # Position input: the current step index for every beam.
            pos = layers.fill_constant([caches_batch_size, 1, 1], dtype='int64', value=1)
            pos = layers.elementwise_mul(pos, step_idx, axis=0)

            # Only the most recent token of each alive sequence is decoded.
            alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1])
            alive_seq_2 = alive_seq_1[:, -1:]
            alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1])

            logits = wrap_decoder(
                trg_vocab_size, max_in_len, n_layer, n_head, d_key,
                d_value, d_model, d_inner_hid, prepostprocess_dropout,
                attention_dropout, relu_dropout, preprocess_cmd,
                postprocess_cmd, weight_sharing, embedding_sharing,
                dec_inputs=(alive_seq_2, alive_seq_2, pos, None, tgt_src_attn_bias),
                enc_output=enc_output, caches=caches, is_train=False, params_type=params_type)

            alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, finished_flags_2, caches_2 = \
                    beam_search.inner_func(step_idx, logits, alive_seq_1, alive_log_probs, finished_seq,
                                           finished_scores, finished_flags, caches, enc_output,
                                           tgt_src_attn_bias)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            finish_cond = beam_search.is_finished(step_idx, source_length, alive_log_probs_2,
                                                  finished_scores_2, finished_flags_2)

            # Copy the new state back into the loop-carried variables.
            layers.assign(alive_seq_2, alive_seq)
            layers.assign(alive_log_probs_2, alive_log_probs)
            layers.assign(finished_seq_2, finished_seq)
            layers.assign(finished_scores_2, finished_scores)
            layers.assign(finished_flags_2, finished_flags)

            # `enumerate` replaces the Python-2-only `xrange`.
            for layer_idx, new_cache in enumerate(caches_2):
                layers.assign(new_cache["k"], caches[layer_idx]["k"])
                layers.assign(new_cache["v"], caches[layer_idx]["v"])

            layers.logical_and(x=cond, y=finish_cond, out=cond)

        # mask is 1 for batch entries whose summed finished flags are nonzero,
        # replicated across the beam dimension; mask2 is its complement.
        finished_flags = layers.reduce_sum(finished_flags, dim=1, keep_dim=True) / beam_size
        finished_flags = layers.cast(finished_flags, 'bool')
        mask = layers.cast(layers.reduce_any(input=finished_flags, dim=1, keep_dim=True), 'float32')
        mask = layers.expand(mask, [1, beam_size])

        mask2 = 1.0 - mask
        finished_seq = layers.cast(finished_seq, 'float32')
        alive_seq = layers.cast(alive_seq, 'float32')

        # Take the finished results where mask is 1, the alive ones otherwise.
        finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \
                        layers.elementwise_mul(alive_seq, mask2, axis = 0)
        finished_seq = layers.cast(finished_seq, 'int32')
        finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \
                            layers.elementwise_mul(alive_log_probs, mask2)
        finished_seq.persistable = True
        finished_scores.persistable = True

        return finished_seq, finished_scores
Пример #24
0
    def test_read_write(self):
        """Check tensor-array write/read round-trips values and gradients.

        Three inputs are written to a tensor array and read back. The summed
        means of the read values must match the summed means of the inputs,
        and after ``append_backward`` the input gradients must sum to about 1.
        """
        x = [
            layers.data(
                name='x0', shape=[100]), layers.data(
                    name='x1', shape=[100]), layers.data(
                        name='x2', shape=[100])
        ]

        for each_x in x:
            each_x.stop_gradient = False

        # Store the inputs at indices 0, 1 and 2.
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        arr = layers.array_write(x=x[0], i=i)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[1], i=i, array=arr)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[2], i=i, array=arr)

        # Read them back using a new index variable.
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        a0 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a1 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a2 = layers.array_read(array=arr, i=i)

        mean_a0 = layers.mean(a0)
        mean_a1 = layers.mean(a1)
        mean_a2 = layers.mean(a2)

        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

        mean_x0 = layers.mean(x[0])
        mean_x1 = layers.mean(x[1])
        mean_x2 = layers.mean(x[2])

        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

        scope = core.Scope()
        cpu = core.CPUPlace()

        exe = Executor(cpu)

        tensor = numpy.random.random(size=(100, 100)).astype('float32')

        outs = exe.run(feed={'x0': tensor,
                             'x1': tensor,
                             'x2': tensor},
                       fetch_list=[a_sum, x_sum],
                       scope=scope)
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        # Materialise the gradient variables as a list; a Python 3 ``map``
        # object is a lazy iterator and cannot serve as ``fetch_list``.
        lookup_var = default_main_program().global_block().var
        g_vars = [lookup_var(each_x.name + "@GRAD") for each_x in x]
        g_out = [
            item.sum()
            for item in exe.run(
                feed={'x0': tensor,
                      'x1': tensor,
                      'x2': tensor},
                fetch_list=g_vars)
        ]
        g_out_sum = numpy.array(g_out).sum()

        # since our final gradient is 1 and the neural network are all linear
        # with mean_op.
        # the input gradient should also be 1
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
Пример #25
0
    def beam_search():
        """Build the While-loop beam-search decoding graph.

        Reads ``start_tokens``, ``init_scores``, ``trg_src_attn_bias``,
        ``parent_idx``, ``enc_output`` and the model hyper-parameters from the
        enclosing scope.

        Returns:
            tuple: ``(finished_ids, finished_scores)`` produced by
            ``layers.beam_search_decode`` from the per-step id and score
            arrays.
        """
        max_len = layers.fill_constant(shape=[1],
                                       dtype=start_tokens.dtype,
                                       value=max_out_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=start_tokens.dtype,
                                        value=0,
                                        force_cpu=True)
        cond = layers.less_than(x=step_idx,
                                y=max_len)  # default force_cpu=True
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                 step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps in decoder self-attention
        # and static encoder output projections in encoder-decoder attention
        # to reduce redundant computation.
        caches = [
            {
                "k":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_key],
                    dtype=enc_output.dtype,
                    value=0),
                "v":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_value],
                    dtype=enc_output.dtype,
                    value=0),
                "static_k":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype),
                "static_v":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype)
            } for i in range(n_layer)
        ]

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            # Since beam_search_op doesn't enforce pre_ids' shape, we can do
            # inplace reshape here which actually change the shape of pre_ids.
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # gather cell states corresponding to selected parent
            pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                              index=parent_idx)
            # position input: the current step index for every row
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_src_attn_bias,  # can't use lod tensor here
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)
            logits = wrap_decoder(trg_vocab_size,
                                  max_in_len,
                                  n_layer,
                                  n_head,
                                  d_key,
                                  d_value,
                                  d_model,
                                  d_inner_hid,
                                  prepostprocess_dropout,
                                  attention_dropout,
                                  relu_dropout,
                                  preprocess_cmd,
                                  postprocess_cmd,
                                  weight_sharing,
                                  dec_inputs=(pre_ids, pre_pos, None,
                                              pre_src_attn_bias),
                                  enc_output=enc_output,
                                  caches=caches,
                                  gather_idx=parent_idx,
                                  bos_idx=bos_idx)
            # intra-beam topK
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores,
                                                 axis=0)
            # beam_search op uses lod to differentiate branches.
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            # topK reduction across beams, also contain special handle of
            # end beams and end sentences(batch reduction)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx,
                return_parent_idx=True)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # cell states(caches) have been updated in wrap_decoder,
            # only need to update beam search states here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(gather_idx, parent_idx)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            # continue while under max_len and selected_ids is not empty
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
Пример #26
0
 def increment_step():
     """Advance the ``self.good_steps`` counter using the increment op's defaults."""
     layers.increment(x=self.good_steps)
Пример #27
0
    def call(self, global_img_feat, p_img_feat, embedding_fn, words=None):
        """Unroll the LSTM decoder with sentinel attention inside a While loop.

        Args:
            global_img_feat: global image feature, concatenated with the word
                embedding at every step (assumed [batch, feat] -- TODO
                confirm).
            p_img_feat: per-region image features, projected with
                ``num_flatten_dims=2``.
            embedding_fn: callable mapping word ids to word embeddings.
            words: ground-truth word ids, used when ``self.mode`` is not
                ``'eval'``; transposed to [seq, batch] before the loop.

        Returns:
            The per-step outputs stacked and transposed to batch-major order:
            word predictions in train mode, argmax word ids (cast to float32)
            otherwise.
        """
        # image features
        img_feat = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2, act='tanh')  # [batch, k, hid]
        img_feat_emb = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2)

        if self.mode == 'eval':
            # Decoding starts from the configured start token.
            word = layers.fill_constant_batch_size_like(global_img_feat, [-1],
                                                        dtype='int64',
                                                        value=config.data['start_idx'])
        else:
            words = layers.transpose(words, [1, 0])  # [seq, batch]
            words.stop_gradient = True
        # LSTM state initialisation
        hid, cell = create_zero_state(global_img_feat), create_zero_state(global_img_feat)

        # While-loop parameter initialisation
        # (mx already covers the eval case, so it is not re-assigned below).
        mx = decoder_config['sentence_length'] - 1 if self.mode == 'train' else decoder_config['infer_max_length']
        if self.mode == 'eval':
            # NOTE(review): the array is declared int64 but the values written
            # in the loop are cast to float32 -- confirm this is intended.
            while_op_output = layers.create_array('int64')
        else:
            while_op_output = layers.create_array('float32')
        max_step = layers.fill_constant(shape=[1], dtype='int64', value=mx)
        step = layers.fill_constant(shape=[1], dtype='int64', value=0)
        cond = layers.less_than(step, max_step)
        while_op = layers.While(cond)

        with while_op.block():
            if self.mode == 'train':
                # Teacher forcing: feed the ground-truth word of this step.
                st = layers.cast(step, 'int32')
                word = layers.slice(words, axes=[0], starts=st, ends=st + 1)
                word = layers.squeeze(word, [0])
                word.stop_gradient = True

            word_emb = embedding_fn(word)
            # Maybe element-wise addition would work better here?
            xt = layers.concat([word_emb, global_img_feat], axis=-1)  # [batch, feat]
            h, c = layers.lstm_unit(xt, hid, cell, param_attr=fluid.ParamAttr('lstm_w'),
                                    bias_attr=fluid.ParamAttr('lstm_b'))
            # The sentinel gate uses the previous hidden state (`hid` is only
            # overwritten further down).
            p_word_emb = layers.fc(xt, size=self.hid_size)
            p_hidden = layers.fc(hid, size=self.hid_size)
            sentinel_gate = layers.sigmoid(p_word_emb + p_hidden)  # [batch, hidden]
            sentinel = layers.elementwise_mul(sentinel_gate, layers.tanh(c))  # [batch, hidden]

            # Carry the new LSTM state over to the next iteration.
            layers.assign(h, hid)
            layers.assign(c, cell)

            k = layers.shape(p_img_feat)[1]

            p_hid = layers.fc(h, self.hid_size, act='tanh')
            # attention part
            #     alpha
            hid_emb = layers.fc(p_hid, self.hid_size)  # [batch, hidden]
            exp_hid_emb = layers.expand(layers.unsqueeze(hid_emb, 1), [1, k + 1, 1])  # [batch, k+1, hidden]
            sentinel_emb = layers.unsqueeze(layers.fc(sentinel, self.hid_size), axes=1)  # [batch, 1, hidden]
            feat_emb = layers.concat([img_feat_emb, sentinel_emb], axis=1)  # [batch, k+1, hidden]
            z = layers.tanh(feat_emb + exp_hid_emb)  # [batch, k+1, hidden]
            alpha = layers.fc(z, size=1, num_flatten_dims=2, act='softmax')  # [batch, k+1, 1]

            #     context vector

            context = layers.concat([img_feat, layers.unsqueeze(sentinel, axes=1)], axis=1)  # [batch, k+1, hidden]
            context = layers.elementwise_mul(context, alpha, axis=0)
            context = layers.reduce_mean(context, dim=1)  # [batch, hidden]

            out = layers.fc(context + p_hid, self.hid_size, act='tanh')

            word_pred = weight_tying_fc(out)  # [batch, vocab]

            if self.mode == 'eval':
                # Greedy decoding: the argmax becomes the next input word.
                next_word = layers.argmax(word_pred, axis=-1)
                layers.assign(next_word, word)
                next_word = layers.cast(next_word, 'float32')
                layers.array_write(next_word, step, array=while_op_output)
            else:
                layers.array_write(word_pred, step, array=while_op_output)
            layers.increment(step)
            layers.less_than(step, max_step, cond=cond)
        if self.mode == 'train':
            output_time_major, _ = layers.tensor_array_to_tensor(while_op_output, axis=0, use_stack=True)
            output = layers.transpose(output_time_major, [1, 0, 2])
        else:
            output_time_major = layers.tensor_array_to_tensor(while_op_output, axis=0, use_stack=True)[0]
            output = layers.transpose(output_time_major, [1, 0])

        return output
Пример #28
0
    def run_main(self, place, with_data_parallel):
        """Build a nested-``While`` program, train it, and check its output.

        Three 10-element feeds are staged in a tensor array, an outer and an
        inner ``While`` loop add them into a second ("memory") array, and for
        five iterations the total of the fetched memory entry is asserted to
        match the total of the fed data within ``0.01``.

        Args:
            place: Place the executor runs on.  Nothing is executed when this
                is a ``core.CUDAPlace`` and Paddle was built without CUDA.
            with_data_parallel: When truthy, the main program is wrapped in a
                data-parallel ``CompiledProgram`` and every feed is repeated
                once per device.
        """
        self.place = place
        self.with_data_parallel = with_data_parallel

        on_cuda = isinstance(self.place, core.CUDAPlace)
        if on_cuda and not core.is_compiled_with_cuda():
            return

        # Only the data-parallel run needs the real device count.
        if not self.with_data_parallel:
            device_cnt = 1
        elif on_cuda:
            device_cnt = core.get_cuda_device_count()
        else:
            device_cnt = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

        # Feed variables, created in the order d0, d1, d2.
        feed_names = ('d0', 'd1', 'd2')
        feed_vars = [
            layers.data(feed_name,
                        shape=[10],
                        append_batch_size=False,
                        dtype='float32') for feed_name in feed_names
        ]

        # Stage the feeds in data_array; slot 0 of mem_array starts as zeros.
        write_idx = layers.zeros(shape=[1], dtype='int64')
        write_idx.stop_gradient = True

        zero_mem = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=zero_mem, i=write_idx)
        data_array = layers.array_write(x=feed_vars[0], i=write_idx)

        for later_feed in feed_vars[1:]:
            write_idx = layers.increment(write_idx)
            layers.array_write(later_feed, write_idx, array=data_array)

        # Outer loop: counter starts at 0 and is bounded by 1.
        outer_idx = layers.zeros(shape=[1], dtype='int64')
        outer_idx.stop_gradient = True

        outer_limit = layers.fill_constant(shape=[1], dtype='int64', value=1)
        outer_limit.stop_gradient = True
        outer_cond = layers.less_than(x=outer_idx, y=outer_limit)

        # Inner loop: counter starts at 1 and is bounded by 3.
        inner_idx = layers.fill_constant(shape=[1], dtype='int64', value=1)
        inner_idx.stop_gradient = True

        inner_limit = layers.fill_constant(shape=[1], dtype='int64', value=3)
        inner_limit.stop_gradient = True
        inner_cond = layers.less_than(x=inner_idx, y=inner_limit)

        outer_loop = layers.While(cond=outer_cond)
        inner_loop = layers.While(cond=inner_cond)
        with outer_loop.block():
            cur_data = layers.array_read(array=data_array, i=outer_idx)
            cur_mem = layers.array_read(array=mem_array, i=outer_idx)
            cur_data = layers.reshape(cur_data, shape=[10])
            cur_mem = layers.reshape(cur_mem, shape=[10])
            outer_total = layers.sums(input=[cur_data, cur_mem])

            # The sum is stored at the *incremented* index.
            outer_idx = layers.increment(x=outer_idx, in_place=True)
            layers.array_write(outer_total, i=outer_idx, array=mem_array)
            layers.less_than(x=outer_idx, y=outer_limit, cond=outer_cond)
            with inner_loop.block():
                nested_data = layers.array_read(array=data_array, i=inner_idx)
                nested_mem = layers.array_read(array=mem_array, i=inner_idx)
                nested_data = layers.reshape(nested_data, shape=[10])
                nested_mem = layers.reshape(nested_mem, shape=[10])
                inner_total = layers.sums(input=[nested_data, nested_mem])

                inner_idx = layers.increment(x=inner_idx, in_place=True)
                layers.array_write(inner_total, i=inner_idx, array=mem_array)
                layers.less_than(x=inner_idx, y=inner_limit, cond=inner_cond)

        sum_result = layers.array_read(array=mem_array, i=inner_idx)
        sum_result.persistable = True
        tiled = layers.unsqueeze(sum_result, axes=[0])
        tiled = layers.expand(tiled, expand_times=[10, 1])
        # The fc output is never read; the layer is still added to the program.
        layers.fc(tiled, size=256)
        loss = layers.mean(sum_result)

        optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
        optimizer.minimize(loss)

        exe = Executor(self.place)
        exe.run(fluid.default_startup_program())

        main_prog = fluid.default_main_program()
        prog = main_prog
        if self.with_data_parallel:
            prog = compiler.CompiledProgram(main_prog).with_data_parallel(
                loss_name=loss.name)

        for _ in range(5):
            samples = [
                numpy.random.random(size=[10]).astype('float32')
                for _sample in range(3)
            ]
            if self.with_data_parallel:
                # One copy of every sample per device.
                samples = [
                    numpy.array([sample] * device_cnt) for sample in samples
                ]

            outs = exe.run(program=prog,
                           feed=dict(zip(feed_names, samples)),
                           fetch_list=[sum_result])
            self.assertAlmostEqual(numpy.sum(samples),
                                   numpy.sum(outs[0]),
                                   delta=0.01)
Пример #29
0
    def test_hybrid_parallel_inference_helper_mp1pp2(self):
        """Run a ``While`` program through ``HybridParallelInferenceHelper``.

        The program is built with ``num_mp=1`` and ``num_pp=2``: the loop body
        is split between ``gpu:0`` and ``gpu:1`` via ``device_guard``.  Two
        random batches are fed and the fetched tensor array is compared
        element by element with the result of ``numpy_while``.
        """

        # Distributed settings come from the environment, with single-process defaults.
        # NOTE(review): `rank` is read but never used in this test.
        nranks = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
        rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
        dev_id = int(os.getenv("FLAGS_selected_gpus", 0))

        main_program = paddle.static.Program()
        startup_program = paddle.static.Program()

        device = "gpu"

        with paddle.static.program_guard(main_program, startup_program):
            # The input placeholder is placed on the first device.
            with paddle.fluid.device_guard(f'{device}:0'):
                X = paddle.static.data(
                    name='X', shape=[None, 2], dtype='float32')

            # Loop bookkeeping is tagged with the "<device>:all" guard.
            with paddle.fluid.device_guard(f'{device}:all'):
                # Loop bound: the While loop runs while step_idx < 2.
                max_len = layers.fill_constant(
                    shape=[1],
                    dtype="int64",
                    value=2,
                    force_cpu=False,
                    name="n")
                step_idx = layers.fill_constant(
                    shape=[1],
                    dtype="int64",
                    value=0,
                    force_cpu=False,
                    name="i")

                # Tensor array seeded with the input at index 0.
                data = layers.array_write(X, step_idx)

                # Integer mirror of the loop condition (see the end of the loop body).
                cond_int = layers.fill_constant(
                    shape=[1],
                    dtype="int64",
                    value=0,
                    force_cpu=False,
                    name="cond_int")
                cond = layers.less_than(x=step_idx, y=max_len)
                while_op = layers.While(cond, is_test=True)

            with while_op.block():
                with paddle.fluid.device_guard(f'{device}:all'):
                    # Read the current entry, advance the counter, and copy the
                    # entry forward to the new index.
                    input = layers.array_read(array=data, i=step_idx)
                    layers.increment(x=step_idx, value=1.0, in_place=True)
                    layers.array_write(input, i=step_idx, array=data)

                # First stage: [2, 5] weight filled with 1.0.
                with paddle.fluid.device_guard(f'{device}:0'):
                    param_attr = paddle.ParamAttr(
                        initializer=paddle.nn.initializer.Constant(1.0))
                    weight1 = paddle.static.create_parameter(
                        shape=[2, 5],
                        dtype='float32',
                        attr=param_attr,
                        is_bias=False)
                    hidden1 = paddle.matmul(input, weight1)

                # Second stage: [5, 2] weight filled with 2.0.
                with paddle.fluid.device_guard(f'{device}:1'):
                    param_attr = paddle.ParamAttr(
                        initializer=paddle.nn.initializer.Constant(2.0))
                    weight2 = paddle.static.create_parameter(
                        shape=[5, 2],
                        dtype='float32',
                        attr=param_attr,
                        is_bias=False)
                    hidden2 = paddle.matmul(hidden1, weight2)

                    # Overwrites the entry that was copied forward above.
                    layers.array_write(hidden2, i=step_idx, array=data)

                    # update cond and assign to cond_int, we will sync cond_int
                    # NOTE(review): cond_int was created as int64 but receives an
                    # int32 cast here - confirm the dtype mismatch is intended.
                    layers.less_than(x=step_idx, y=max_len, cond=cond)
                    layers.assign(layers.cast(cond, dtype="int32"), cond_int)

                with paddle.fluid.device_guard(f'{device}:all'):
                    # The code below must be at the end of the while block and
                    # exist in device:all.
                    layers.assign(layers.cast(cond_int, dtype='bool'), cond)

            with paddle.fluid.device_guard(f'{device}:all'):
                # Copy the loop result into a separate array that is fetched below.
                out = layers.create_array(data.dtype)
                layers.assign(data, out)

            with paddle.fluid.device_guard(f'{device}:all'):
                # Use an empty lod_tensor_array to clear lod_tensor_array.
                layers.assign(layers.create_array(data.dtype), data)

        helper = HybridParallelInferenceHelper(
            startup_program,
            main_program,
            micro_batch_size=2,
            num_mp=1,
            num_pp=2,
            init_comm=nranks > 1, )
        # Presumably these are the auto-generated names of the first array_write
        # output and of the cond_int assignment; they depend on the order in
        # which ops were created above - verify before reordering anything.
        helper.gen_infer_program(
            ['array_write_0.out'], ['cond_int.tmp_0'], debug=True)

        exe = paddle.static.Executor(paddle.CUDAPlace(dev_id))
        exe.run(startup_program)

        for step in range(2):
            init_data = np.random.uniform(
                low=0.0, high=1.0, size=[2, 2]).astype('float32')
            [res] = exe.run(main_program,
                            feed={"X": init_data},
                            fetch_list=[out])
            # Reference result from numpy_while, which is defined outside this method.
            res_np = numpy_while(init_data)

            assert len(res) == len(res_np)
            for d1, d2 in zip(res, res_np):
                np.testing.assert_allclose(d1, d2)
Пример #30
0
def decode(context, is_sparse):
    """Assemble the ``While``-loop beam-search decoder graph.

    Args:
        context: Variable written into slot 0 of the state array as the
            initial decoder state.
        is_sparse: Forwarded to ``pd.embedding`` as ``is_sparse``.

    Returns:
        The ``(translation_ids, translation_scores)`` pair produced by
        ``pd.beam_search_decode`` from the per-step id and score arrays.
    """
    step_limit = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    step = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # Per-step decoder states; slot 0 is seeded with the incoming context.
    state_history = pd.create_array('float32')
    pd.array_write(context, array=state_history, i=step)

    # Per-step selected ids and scores, carried across iterations.
    id_history = pd.create_array('int64')
    score_history = pd.create_array('float32')

    start_ids = pd.data(
        name="init_ids", shape=[1], dtype="int64", lod_level=2)
    start_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(start_ids, array=id_history, i=step)
    pd.array_write(start_scores, array=score_history, i=step)

    keep_going = pd.less_than(x=step, y=step_limit)

    loop = pd.While(cond=keep_going)
    with loop.block():
        prev_ids = pd.array_read(array=id_history, i=step)
        prev_state = pd.array_read(array=state_history, i=step)
        prev_scores = pd.array_read(array=score_history, i=step)

        # Give the previous state the same LoD as the previous scores.
        expanded_state = pd.sequence_expand(prev_state, prev_scores)

        prev_ids_emb = pd.embedding(
            input=prev_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # Recurrent step: new state from the expanded state and the embedding.
        new_state = pd.fc(
            input=[expanded_state, prev_ids_emb],
            size=decoder_size,
            act='tanh')
        new_state_with_lod = pd.lod_reset(x=new_state, y=prev_scores)
        # Softmax scores over the target dictionary feed the beam search.
        word_scores = pd.fc(
            input=new_state_with_lod, size=target_dict_dim, act='softmax')
        cand_scores, cand_ids = pd.topk(word_scores, k=topk_size)
        chosen_ids, chosen_scores = pd.beam_search(
            prev_ids, cand_ids, cand_scores, beam_size, end_id=10, level=0)

        pd.increment(x=step, value=1, in_place=True)

        # Store this step's results at the incremented index.
        pd.array_write(new_state, array=state_history, i=step)
        pd.array_write(chosen_ids, array=id_history, i=step)
        pd.array_write(chosen_scores, array=score_history, i=step)

        pd.less_than(x=step, y=step_limit, cond=keep_going)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=id_history, scores=score_history)

    return translation_ids, translation_scores
def decoder_decode(context, is_sparse):
    """Build the beam-search decoder graph with accumulated log scores.

    Args:
        context: Variable written into slot 0 of the state array as the
            initial decoder state.
        is_sparse: Forwarded to ``pd.embedding`` as ``is_sparse``.

    Returns:
        The ``(translation_ids, translation_scores)`` pair produced by
        ``pd.beam_search_decode`` from the per-step id and score arrays.
    """
    step_limit = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    step = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # Per-step decoder states; slot 0 is seeded with the incoming context.
    state_history = pd.create_array('float32')
    pd.array_write(context, array=state_history, i=step)

    # Per-step selected ids and scores, carried across iterations.
    id_history = pd.create_array('int64')
    score_history = pd.create_array('float32')

    start_ids = pd.data(
        name="init_ids", shape=[1], dtype="int64", lod_level=2)
    start_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(start_ids, array=id_history, i=step)
    pd.array_write(start_scores, array=score_history, i=step)

    keep_going = pd.less_than(x=step, y=step_limit)

    loop = pd.While(cond=keep_going)
    with loop.block():
        prev_ids = pd.array_read(array=id_history, i=step)
        prev_state = pd.array_read(array=state_history, i=step)
        prev_scores = pd.array_read(array=score_history, i=step)

        # Give the previous state the same recursive sequence lengths as the
        # previous scores.
        expanded_state = pd.sequence_expand(prev_state, prev_scores)

        prev_ids_emb = pd.embedding(
            input=prev_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # Recurrent step: new state from the expanded state and the embedding.
        new_state = pd.fc(
            input=[expanded_state, prev_ids_emb],
            size=decoder_size,
            act='tanh')
        new_state_with_lod = pd.lod_reset(x=new_state, y=prev_scores)
        # Softmax scores over the target dictionary feed the beam search.
        word_scores = pd.fc(
            input=new_state_with_lod, size=target_dict_dim, act='softmax')
        cand_scores, cand_ids = pd.topk(word_scores, k=beam_size)

        # Accumulate scores after top-k to reduce computation cost.
        log_cand_scores = pd.log(cand_scores)
        flat_prev_scores = pd.reshape(prev_scores, shape=[-1])
        total_scores = pd.elementwise_add(
            x=log_cand_scores, y=flat_prev_scores, axis=0)

        chosen_ids, chosen_scores = pd.beam_search(
            prev_ids,
            prev_scores,
            cand_ids,
            total_scores,
            beam_size,
            end_id=10,
            level=0)

        pd.increment(x=step, value=1, in_place=True)

        # Store this step's results at the incremented index.
        pd.array_write(new_state, array=state_history, i=step)
        pd.array_write(chosen_ids, array=id_history, i=step)
        pd.array_write(chosen_scores, array=score_history, i=step)

        # Stop at the maximum length, or earlier once no candidate is left
        # for any source sentence.
        below_limit = pd.less_than(x=step, y=step_limit)
        has_candidates = pd.logical_not(pd.is_empty(x=chosen_ids))
        pd.logical_and(x=below_limit, y=has_candidates, out=keep_going)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=id_history, scores=score_history, beam_size=beam_size, end_id=10)

    return translation_ids, translation_scores
Пример #32
0
        def beam_search():
            """Build the beam-search decoding loop and return its results.

            Runs a ``While`` loop that, at each step, decodes the previous
            ids, masks the end token while the step is below
            ``self.min_out_len``, applies a length penalty (and trigram
            blocking when ``self.block_trigram`` is set), and selects the next
            beams.  The loop stops once ``self.max_out_len`` steps were taken
            or no ids are selected any more.

            ``start_tokens``, ``init_scores``, ``parent_idx``, the attention
            bias tensors, the encoder outputs and ``INF`` are defined outside
            this function and are not visible here.

            Returns:
                The ``(finished_ids, finished_scores)`` pair produced by
                ``layers.beam_search_decode``.
            """

            # Loop bounds and step counters.  step_next_idx always runs one
            # ahead of step_idx (they start at 1 and 0 and are incremented
            # together at the end of each iteration).
            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches contains states of history steps in decoder self-attention
            # and static encoder output projections in encoder-decoder attention
            # to reduce redundant computation.
            # One cache dict per decoder layer; the loop variable `i` is unused.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                # Since beam_search_op doesn't enforce pre_ids' shape, we can do
                # an inplace reshape here, which actually changes the shape of
                # pre_ids.
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                # Position input: a tensor of ones multiplied by the current
                # step index.
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=
                        pre_src_sents_attn_bias,  # can't use a LoD tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating end token if length less than min_out_len:
                # while step_idx < min_len, -INF is added to the logit at
                # eos_idx (via a one-hot mask); otherwise zero is added.
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # topK reduction across beams, also contain special handle of
                # end beams and end sentences(batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the length penalty on the previous scores.
                # The stored previous scores already include the length penalty
                # of their own timestep, so it is multiplied back out here
                # before this timestep's penalty is applied.  As a consequence
                # `scores` always stores the length-penalized score while the
                # computation uses the un-penalized one.
                # -> safe for step_idx == 0 (initialization state), because previous-score == 0
                # The penalty factor is ((5 + step) / 6) ** self.len_penalty.
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    # NOTE(review): this Print op is added to the loop whenever
                    # block_trigram is set - looks like leftover debugging
                    # output; confirm it should stay.
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                # Apply this timestep's length penalty, based on step_next_idx.
                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states(caches) have been updated in wrap_decoder,
                # only need to update beam search states here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                # Continue while below max_len and some ids were selected.
                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores
def decoder_decode(context, is_sparse, topk_size=50, end_id=10):
    """Build the ``While``-loop beam-search decoder graph.

    Args:
        context: Variable written into slot 0 of the state array as the
            initial decoder state.
        is_sparse: Forwarded to ``pd.embedding`` as ``is_sparse``.
        topk_size: Number of top-scoring candidates kept per step before the
            beam search.  Defaults to 50, the value that used to be
            hard-coded.
        end_id: Token id passed to ``pd.beam_search`` as ``end_id``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The ``(translation_ids, translation_scores)`` pair produced by
        ``pd.beam_search_decode`` from the per-step id and score arrays.
    """
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(name="init_scores",
                          shape=[1],
                          dtype="float32",
                          lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(input=pre_ids,
                                   size=[dict_size, word_dim],
                                   dtype='float32',
                                   is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        # Candidate width and end token are caller-configurable; the defaults
        # reproduce the previous fixed values.
        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
        selected_ids, selected_scores = pd.beam_search(pre_ids,
                                                       topk_indices,
                                                       topk_scores,
                                                       beam_size,
                                                       end_id=end_id,
                                                       level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories (written at the incremented index)
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # keep looping while counter < array_len
        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)

    return translation_ids, translation_scores
Пример #34
0
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
                        num_bad_steps, incr_every_n_steps,
                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
    """
    Adjust the dynamic loss scaling in place, based on whether the gradients
    of the current step were all finite.

    Nothing is returned: the results are written back into
    ``prev_loss_scaling``, ``num_good_steps`` and ``num_bad_steps``.

    Finite step:
        * if ``incr_every_n_steps < num_good_steps + 1``, the loss scaling is
          multiplied by ``incr_ratio`` (the product is only stored when it is
          itself finite) and both step counters are reset to zero;
        * otherwise ``num_good_steps`` is incremented and ``num_bad_steps``
          is reset to zero.

    Non-finite step:
        * if ``decr_every_n_nan_or_inf < num_bad_steps + 1``, the loss scaling
          is multiplied by ``decr_ratio`` but never stored below 1.0, and both
          step counters are reset to zero;
        * otherwise ``num_bad_steps`` is incremented and ``num_good_steps``
          is reset to zero.

    Args:
        is_overall_finite (Variable): Boolean variable telling whether all
            gradients are finite.
        prev_loss_scaling (Variable): Current loss scaling; overwritten with
            the new value when it changes.
        num_good_steps (Variable): Counter of steps in which all gradients
            were finite.
        num_bad_steps (Variable): Counter of steps in which some gradients
            were nan or inf.
        incr_every_n_steps (Variable): Threshold on ``num_good_steps`` that
            triggers an increase of the loss scaling.
        decr_every_n_nan_or_inf (Variable): Threshold on ``num_bad_steps``
            that triggers a decrease of the loss scaling.
        incr_ratio (float): Multiplier applied when increasing the loss
            scaling.
        decr_ratio (float): Less-than-one multiplier applied when decreasing
            the loss scaling.
    """
    # Shared constant used to reset either step counter.
    zero = layers.fill_constant(shape=[1], dtype='int32', value=0)

    with layers.Switch() as finite_switch:
        # ---- every gradient of this step is finite ----
        with finite_switch.case(is_overall_finite):
            grow_now = layers.less_than(incr_every_n_steps,
                                        num_good_steps + 1)
            with layers.Switch() as grow_switch:
                with grow_switch.case(grow_now):
                    grown_scaling = prev_loss_scaling * incr_ratio
                    grown_is_finite = layers.isfinite(grown_scaling)
                    # Keep the previous scaling if the increase overflowed.
                    with layers.Switch() as overflow_guard:
                        with overflow_guard.case(grown_is_finite):
                            layers.assign(grown_scaling, prev_loss_scaling)
                        with overflow_guard.default():
                            pass
                    layers.assign(zero, num_good_steps)
                    layers.assign(zero, num_bad_steps)

                with grow_switch.default():
                    # Threshold not reached yet: extend the good streak.
                    layers.increment(num_good_steps)
                    layers.assign(zero, num_bad_steps)

        # ---- some gradient of this step is nan or inf ----
        with finite_switch.default():
            shrink_now = layers.less_than(decr_every_n_nan_or_inf,
                                          num_bad_steps + 1)
            with layers.Switch() as shrink_switch:
                with shrink_switch.case(shrink_now):
                    shrunk_scaling = prev_loss_scaling * decr_ratio
                    # Lower bound for the loss scaling.
                    scaling_floor = layers.fill_constant(
                        shape=[1], dtype='float32', value=1.0)
                    below_floor = layers.less_than(shrunk_scaling,
                                                   scaling_floor)
                    with layers.Switch() as floor_switch:
                        with floor_switch.case(below_floor):
                            layers.assign(scaling_floor, prev_loss_scaling)
                        with floor_switch.default():
                            layers.assign(shrunk_scaling, prev_loss_scaling)
                    layers.assign(zero, num_good_steps)
                    layers.assign(zero, num_bad_steps)
                with shrink_switch.default():
                    # Threshold not reached yet: extend the bad streak.
                    layers.assign(zero, num_good_steps)
                    layers.increment(num_bad_steps)