Example #1
    def test_static_graph(self):
        dtype = 'float64'

        positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3)
        positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3)

        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
                                        shape=[2],
                                        dtype="int32")
        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
                                        shape=[2],
                                        dtype="int64")

        out_1 = paddle.empty(shape=[200, 3], dtype=dtype)
        out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype)
        out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype)
        out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype)
        out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype)

        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        res_1, res_2, res_3, res_4, res_5 = exe.run(
            fluid.default_main_program(),
            feed={
                "shape_tensor_int32": np.array([200, 3]).astype("int32"),
                "shape_tensor_int64": np.array([200, 3]).astype("int64"),
            },
            fetch_list=[out_1, out_2, out_3, out_4, out_5])

        self.__check_out__(res_1, dtype)
        self.__check_out__(res_2, dtype)
        self.__check_out__(res_3, dtype)
        self.__check_out__(res_4, dtype)
        self.__check_out__(res_5, dtype)
Example #2
 def test_async_read_only_1dim(self):
     src = paddle.rand([40], dtype="float32").pin_memory()
     dst = paddle.empty([40], dtype="float32")
     buffer_ = paddle.empty([20]).pin_memory()
     with cuda.stream_guard(self.stream):
         core.async_read(src, dst, self.index, buffer_, self.empty,
                         self.empty)
     array1 = paddle.gather(src, self.index)
     array2 = dst[:len(self.index)]
     self.assertTrue(np.allclose(array1.numpy(), array2.numpy()))
Example #3
 def func_setUp(self):
     self.empty = paddle.to_tensor(np.array([], dtype="int64"),
                                   place=paddle.CPUPlace())
     data = np.random.randn(100, 50, 50).astype("float32")
     self.src = paddle.to_tensor(data, place=paddle.CUDAPinnedPlace())
     self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
     self.index = paddle.to_tensor(np.array([1, 3, 5, 7, 9],
                                            dtype="int64")).cpu()
     self.buffer = paddle.empty(shape=[50, 50, 50],
                                dtype="float32").pin_memory()
     self.stream = cuda.Stream()
Example #4
 def test_dygraph_api_attr(self):
     paddle.disable_static()
     shape = [200, 3]
     dtype = 'float64'
     out = paddle.empty(shape=shape, dtype=dtype)
     self.__check_out__(out, dtype)
     paddle.enable_static()
Example #5
 def test_dygraph_api_out_3(self):
     paddle.disable_static()
     shape_data = np.array([200, 3]).astype('int64')
     shape = paddle.to_tensor(shape_data)
     out = paddle.empty(shape=shape)
     self.__check_out__(out)
     paddle.enable_static()
Example #6
    def test_init_process_group(self):
        with _test_eager_guard():
            paddle.distributed.init_parallel_env()
            paddle.distributed.new_group()
            group = paddle.distributed.new_group([-1, -2])
            assert group.process_group is None

            group = paddle.distributed.collective.Group(-1, 2, 0, [-1, -2])
            ret = paddle.distributed.barrier(group)
            assert ret is None
        paddle.enable_static()
        in_tensor = paddle.empty((1, 2))
        in_tensor2 = paddle.empty((1, 2))
        paddle.distributed.broadcast(in_tensor, src=0)
        paddle.distributed.all_gather([in_tensor, in_tensor2], in_tensor)
        print("test ok\n")
Example #7
    def init_mems(self, batch_size, d_model):
        if self.mem_len > 0:
            mems = []
            for _ in range(self.n_layer + 1):
                empty = paddle.empty(
                    shape=[batch_size, 0, d_model], dtype=global_dtype)
                mems.append(empty)

            return mems
        else:
            return None
Example #8
 def test_uniform_random_inplace_op_empty_tensor(self):
     places = ['cpu']
     if fluid.core.is_compiled_with_cuda():
         places.append('gpu')
     test_shapes = [(200, 0), (0, 200)]
     for place in places:
         paddle.set_device(place)
         for test_shape in test_shapes:
             tensor = paddle.empty(shape=test_shape)
             tensor.uniform_()
             tensor_shape_np = np.array(tensor.shape)
             origin_shape = np.array(test_shape)
             self.assertTrue((tensor_shape_np == origin_shape).all())
Example #9
File: utils.py  Project: sandyhouse/Paddle
def _alltoall(in_tensor_list, group=None, use_calc_stream=True):
    if group is not None and not group.is_member():
        return

    if in_dygraph_mode():
        group = paddle.distributed.collective._get_default_group(
        ) if group is None else group
        out = paddle.empty(in_tensor_list.shape, in_tensor_list.dtype)
        task = group.process_group.alltoall(in_tensor_list, out)
        task.wait()
        return out
    else:
        ring_id = 0 if group is None else group.id
        return paddle._C_ops.alltoall(in_tensor_list, 'use_calc_stream',
                                      use_calc_stream, 'ring_id', ring_id)
Example #10
def _all_gather(tensor, group=None, use_calc_stream=True):
    if group is not None and not group.is_member():
        return

    if in_dygraph_mode():
        group = paddle.distributed.collective._get_default_group(
        ) if group is None else group
        tensor_shape = list(tensor.shape)
        tensor_shape[0] *= group.nranks
        out = paddle.empty(tensor_shape, tensor.dtype)

        task = group.process_group.all_gather(tensor, out)
        task.wait()
        return out
    else:
        ring_id = 0 if group is None else group.id
        nranks = paddle.distributed.collective._get_global_group(
        ).nranks if group is None else group.nranks
        return paddle._C_ops.c_allgather(tensor, 'use_calc_stream',
                                         use_calc_stream, 'ring_id', ring_id,
                                         'nranks', nranks)
Example #11
def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False):
    op_type = 'c_allgather'

    if in_dygraph_mode():
        group = paddle.distributed.collective._get_default_group()
        tensor_shape = list(x.shape)
        tensor_shape[0] *= nranks
        out = paddle.empty(tensor_shape, x.dtype)
        task = group.process_group.all_gather(x, out)
        task.wait()
        return out

    if _in_legacy_dygraph():
        attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream',
                 use_calc_stream)
        return _C_ops.c_allgather(x, *attrs)

    helper = LayerHelper(op_type, **locals())
    out_shape = list(x.shape[:])
    if out_shape[0] > 0:
        out_shape[0] *= nranks
    out = helper.create_variable(name=unique_name.generate_with_ignorable_key(
        '.'.join([x.name, op_type])),
                                 shape=out_shape,
                                 dtype=x.dtype,
                                 type=x.type,
                                 persistable=x.persistable)
    helper.append_op(type=op_type,
                     inputs={'X': [x]},
                     outputs={'Out': [out]},
                     attrs={
                         'nranks': nranks,
                         'ring_id': ring_id,
                         'use_calc_stream': use_calc_stream
                     })
    return out
Example #12
 def test_dtype():
     shape = [200, 3]
     dtype = 'uint8'
     result = paddle.empty(shape=shape, dtype=dtype)
Example #13
 def setUp(self):
     self.src = paddle.rand(shape=[100, 50, 50, 5], dtype="float32")
     self.dst = paddle.empty(
         shape=[200, 50, 50, 5], dtype="float32").pin_memory()
     self.stream = cuda.Stream()
Example #14
def _local_scatter(inp, pos):
    if pos.shape != [0]:
        inp_buf = paddle.index_select(inp, pos, 0)
    else:
        inp_buf = paddle.empty([0, inp.shape[1]], dtype=inp.dtype)
    return inp_buf
Example #15
def taylor(M: int,
           nbar=4,
           sll=30,
           norm=True,
           sym: bool = True,
           dtype: str = 'float64') -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.
    Parameters:
        M(int): window size.
        nbar, sll, norm: window-specific parameters.
        sym(bool): whether to return a symmetric window.
            The default value is True.
        dtype(str): the data type of the returned tensor.
    Returns:
        Tensor: the window tensor
    Notes:
        This function is consistent with scipy.signal.windows.taylor().
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
    B = 10**(sll / 20)
    A = _acosh(B) / math.pi
    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
    ma = paddle.arange(1, nbar, dtype=dtype)

    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 +
                                                           (ma - 0.5)**2))
        if mi == 0:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(
                1 - m2[mi] / m2[mi + 1:])

        Fm[mi] = numer / denom

    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))

    w = W(paddle.arange(0, M, dtype=dtype))

    # normalize (Note that this is not described in the original text [1])
    if norm:
        scale = 1.0 / W((M - 1) / 2)
        w *= scale
    w = w.squeeze()
    return _truncate(w, needs_trunc)
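
A minimal usage sketch of the taylor() window above (assumptions: the function and its private helpers _len_guards, _extend, _acosh and _truncate live together in a hypothetical module named window_ops; that module name is illustrative, not Paddle's public API):

from window_ops import taylor  # hypothetical module holding the snippet above

# 64-point Taylor window: 4 near-in sidelobes, roughly 30 dB below the main
# lobe, normalized so the continuous envelope peak is scaled to unity.
w = taylor(64, nbar=4, sll=30, norm=True, sym=True, dtype='float64')
print(w.shape)         # [64]
print(float(w.max()))  # close to 1.0 for the sampled window when norm=True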
Example #16
    def test_fixed_random_number(self):
        if not paddle.is_compiled_with_cuda():
            return

        # Note(zhouwei): The number of threads is determined by
        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So different
        # GPUs have different numbers of threads, which results in different
        # random values. Only test on a V100 GPU here.
        if not "V100" in paddle.device.cuda.get_device_name():
            return

        print("Test Fixed Random number on V100 GPU------>")
        paddle.disable_static()
        paddle.set_device('gpu')
        paddle.seed(2021)

        x = paddle.empty([64, 3, 1024, 1024], dtype="float32")
        x.exponential_(1.0)
        x_np = x.numpy()
        expect = [
            0.80073667, 0.2249291, 0.07734892, 1.25392, 0.14013891, 0.45736602,
            1.9735607, 0.30490234, 0.57100505, 0.8115938
        ]

        self.assertTrue(np.allclose(x_np[0, 0, 0, 0:10], expect))
        expect = [
            1.4296371e+00, 9.5411777e-01, 5.2575850e-01, 2.4805880e-01,
            1.2322118e-04, 8.4604341e-01, 2.1111444e-01, 1.4143821e+00,
            2.8194717e-01, 1.1360573e+00
        ]
        self.assertTrue(np.allclose(x_np[16, 1, 300, 200:210], expect))
        expect = [
            1.3448033, 0.35146526, 1.7380928, 0.32012638, 0.10396296,
            0.51344526, 0.15308502, 0.18712929, 0.03888268, 0.20771872
        ]
        self.assertTrue(np.allclose(x_np[32, 1, 600, 500:510], expect))
        expect = [
            0.5107464, 0.20970327, 2.1986802, 1.580056, 0.31036147, 0.43966478,
            0.9056133, 0.30119267, 1.4797124, 1.4319834
        ]
        self.assertTrue(np.allclose(x_np[48, 2, 900, 800:810], expect))
        expect = [
            3.4640615, 1.1019983, 0.41195083, 0.22681557, 0.291846, 0.53617656,
            1.5791925, 2.4645927, 0.04094889, 0.9057725
        ]
        self.assertTrue(np.allclose(x_np[63, 2, 1023, 1000:1010], expect))

        x = paddle.empty([10, 10], dtype="float32")
        x.exponential_(3.0)
        x_np = x.numpy()
        expect = [
            0.02831675, 0.1691551, 0.6798956, 0.69347525, 0.0243443,
            0.22180498, 0.30574575, 0.9839696, 0.2834912, 0.59420055
        ]
        self.assertTrue(np.allclose(x_np[5, 0:10], expect))

        x = paddle.empty([16, 2, 1024, 768], dtype="float64")
        x.exponential_(0.25)
        x_np = x.numpy()
        expect = [
            10.0541229, 12.67860643, 1.09850734, 7.35289643, 2.65471225,
            3.86217432, 2.97902086, 2.92744479, 2.67927152, 0.19667352
        ]
        self.assertTrue(np.allclose(x_np[0, 0, 0, 100:110], expect))
        expect = [
            0.68328125, 3.1454553, 0.92158376, 1.95842188, 1.05296941,
            12.93242051, 5.20255978, 3.3588624, 1.57377174, 5.73194183
        ]
        self.assertTrue(np.allclose(x_np[4, 0, 300, 190:200], expect))
        expect = [
            1.37973974, 3.45036798, 7.94625406, 1.62610973, 0.31032122,
            4.13596493, 1.98494535, 1.13207041, 8.30592769, 2.81460147
        ]
        self.assertTrue(np.allclose(x_np[8, 1, 600, 300:310], expect))
        expect = [
            2.27710811, 12.25003028, 2.96409124, 4.72405788, 0.67917249,
            4.35856718, 0.46870976, 2.31120149, 9.61595826, 4.64446271
        ]
        self.assertTrue(np.allclose(x_np[12, 1, 900, 500:510], expect))
        expect = [
            0.95883744, 1.57316361, 15.22524512, 20.49559882, 13.70008548,
            3.29430143, 3.90390424, 0.9146657, 0.80972249, 0.33376219
        ]
        self.assertTrue(np.allclose(x_np[15, 1, 1023, 750:760], expect))

        x = paddle.empty([512, 768], dtype="float64")
        x.exponential_(0.3)
        x_np = x.numpy()
        expect = [
            8.79266704, 4.79596009, 2.75480243, 6.04670011, 0.35379556,
            0.76864868, 3.17428251, 0.26556859, 12.22485885, 10.51690383
        ]
        self.assertTrue(np.allclose(x_np[0, 200:210], expect))
        expect = [
            5.6341126, 0.52243418, 5.36410796, 6.83672002, 11.9243311,
            5.85985566, 5.75169548, 0.13877972, 6.1348385, 3.82436519
        ]
        self.assertTrue(np.allclose(x_np[300, 400:410], expect))
        expect = [
            4.94883581, 0.56345306, 0.85841585, 1.92287801, 6.10036656,
            1.19524847, 3.64735434, 5.19618716, 2.57467974, 3.49152791
        ]
        self.assertTrue(np.allclose(x_np[500, 700:710], expect))

        x = paddle.empty([10, 10], dtype="float64")
        x.exponential_(4.0)
        x_np = x.numpy()
        expect = [
            0.15713826, 0.56395964, 0.0680941, 0.00316643, 0.27046853,
            0.19852724, 0.12776634, 0.09642974, 0.51977551, 1.33739699
        ]
        self.assertTrue(np.allclose(x_np[5, 0:10], expect))

        paddle.enable_static()
Example #17
    def __init__(self,
                 rank,
                 local_rank,
                 world_size,
                 batch_size,
                 resume,
                 margin_softmax,
                 num_classes,
                 sample_rate=1.0,
                 embedding_size=512,
                 prefix="./"):
        super(PartialFC, self).__init__()
        self.num_classes: int = num_classes
        self.rank: int = rank
        self.local_rank: int = local_rank
        self.world_size: int = world_size
        self.batch_size: int = batch_size
        self.margin_softmax: callable = margin_softmax
        self.sample_rate: float = sample_rate
        self.embedding_size: int = embedding_size
        self.prefix: str = prefix
        self.num_local: int = num_classes // world_size + int(
            rank < num_classes % world_size)
        self.class_start: int = num_classes // world_size * rank + min(
            rank, num_classes % world_size)
        self.num_sample: int = int(self.sample_rate * self.num_local)

        self.weight_name = os.path.join(
            self.prefix, "rank:{}_softmax_weight.pkl".format(self.rank))
        self.weight_mom_name = os.path.join(
            self.prefix, "rank:{}_softmax_weight_mom.pkl".format(self.rank))

        if resume:
            try:
                self.weight: paddle.Tensor = paddle.load(self.weight_name)
                print("softmax weight resume successfully!")
            except (FileNotFoundError, KeyError, IndexError):
                self.weight = paddle.normal(
                    0, 0.01, (self.num_local, self.embedding_size))
                print("softmax weight resume fail!")

            try:
                self.weight_mom: paddle.Tensor = paddle.load(
                    self.weight_mom_name)
                print("softmax weight mom resume successfully!")
            except (FileNotFoundError, KeyError, IndexError):
                self.weight_mom: paddle.Tensor = paddle.zeros_like(self.weight)
                print("softmax weight mom resume fail!")
        else:
            self.weight = paddle.normal(0, 0.01,
                                        (self.num_local, self.embedding_size))
            self.weight_mom: paddle.Tensor = paddle.zeros_like(self.weight)
            print("softmax weight init successfully!")
            print("softmax weight mom init successfully!")

        self.index = None
        if int(self.sample_rate) == 1:
            self.update = lambda: 0
            self.sub_weight = paddle.create_parameter(
                shape=self.weight.shape,
                dtype='float32',
                default_initializer=paddle.nn.initializer.Assign(self.weight))
            self.sub_weight_mom = self.weight_mom
        else:
            self.sub_weight = paddle.create_parameter(
                shape=[1, 1],
                dtype='float32',
                default_initializer=paddle.nn.initializer.Assign(
                    paddle.empty((1, 1))))
Example #18
def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
    global _hcg

    tensor_recv_prev = None
    tensor_recv_next = None

    # send / recv message
    recv_shape_msg = _send_recv_meta.recv_shape_message
    recv_dtype_msg = _send_recv_meta.recv_dtype_message
    recv_stop_gradient = _send_recv_meta.recv_stop_gradient

    send_shape_msg = _send_recv_meta.send_shape_message
    send_dtype_msg = _send_recv_meta.send_dtype_message

    # model parallel message
    mp_group = _hcg.get_model_parallel_group()
    mp_degree = _hcg.get_model_parallel_world_size()
    mp_rank = _hcg.get_model_parallel_rank()

    if recv_prev:
        if isinstance(recv_shape_msg, tuple):
            tensor_recv_prev = []
            for idx, shape in enumerate(recv_shape_msg):
                tmp = paddle.empty(shape=shape,
                                   dtype=number_2_dtype(recv_dtype_msg[idx]))
                tmp.stop_gradient = recv_stop_gradient[idx]
                tensor_recv_prev.append(tmp)
            tensor_recv_prev = tuple(tensor_recv_prev)
        else:

            tensor_recv_prev = paddle.empty(
                shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg))
            tensor_recv_prev.stop_gradient = recv_stop_gradient

    if recv_next:
        if isinstance(send_shape_msg, tuple):
            tensor_recv_next = []
            for idx, shape in enumerate(send_shape_msg):
                tensor_recv_next.append(
                    paddle.empty(shape=shape,
                                 dtype=number_2_dtype(send_dtype_msg[idx])))
            tensor_recv_next = tuple(tensor_recv_next)
        else:
            tensor_recv_next = paddle.empty(
                shape=send_shape_msg, dtype=number_2_dtype(send_dtype_msg))

    # start to p2p communicate
    if tensor_send_prev is not None:
        if isinstance(tensor_send_prev, tuple):
            for d in tensor_send_prev:
                paddle.distributed.wait(d, use_calc_stream=True)
                send_partial(d,
                             dst=0,
                             nranks=mp_degree,
                             rank_id=mp_rank,
                             group=_hcg.send_prev_group,
                             use_calc_stream=False)
        else:
            paddle.distributed.wait(tensor_send_prev, use_calc_stream=True)
            send_partial(tensor_send_prev,
                         dst=0,
                         nranks=mp_degree,
                         rank_id=mp_rank,
                         group=_hcg.send_prev_group,
                         use_calc_stream=False)

    if tensor_recv_prev is not None:
        if isinstance(tensor_recv_prev, tuple):
            for d in tensor_recv_prev:
                recv_partial(d,
                             src=0,
                             nranks=mp_degree,
                             rank_id=mp_rank,
                             group=_hcg.recv_prev_group,
                             use_calc_stream=True)
                allgather_partial(d,
                                  nranks=mp_degree,
                                  rank_id=mp_rank,
                                  group=mp_group,
                                  use_calc_stream=True)
        else:
            recv_partial(tensor_recv_prev,
                         src=0,
                         nranks=mp_degree,
                         rank_id=mp_rank,
                         group=_hcg.recv_prev_group,
                         use_calc_stream=True)
            allgather_partial(tensor_recv_prev,
                              nranks=mp_degree,
                              rank_id=mp_rank,
                              group=mp_group,
                              use_calc_stream=True)

    if tensor_send_next is not None:
        if isinstance(tensor_send_next, tuple):
            for d in tensor_send_next:
                paddle.distributed.wait(d, use_calc_stream=True)
                send_partial(d,
                             dst=1,
                             nranks=mp_degree,
                             rank_id=mp_rank,
                             group=_hcg.send_next_group,
                             use_calc_stream=False)
        else:
            paddle.distributed.wait(tensor_send_next, use_calc_stream=True)
            send_partial(tensor_send_next,
                         dst=1,
                         nranks=mp_degree,
                         rank_id=mp_rank,
                         group=_hcg.send_next_group,
                         use_calc_stream=False)

    if tensor_recv_next is not None:
        if isinstance(tensor_recv_next, tuple):
            for d in tensor_recv_next:
                recv_partial(d,
                             src=1,
                             nranks=mp_degree,
                             rank_id=mp_rank,
                             group=_hcg.recv_next_group,
                             use_calc_stream=True)
                allgather_partial(d,
                                  nranks=mp_degree,
                                  rank_id=mp_rank,
                                  group=mp_group,
                                  use_calc_stream=True)

        else:
            recv_partial(tensor_recv_next,
                         src=1,
                         nranks=mp_degree,
                         rank_id=mp_rank,
                         group=_hcg.recv_next_group,
                         use_calc_stream=True)

            allgather_partial(tensor_recv_next,
                              nranks=mp_degree,
                              rank_id=mp_rank,
                              group=mp_group,
                              use_calc_stream=True)
    return tensor_recv_prev, tensor_recv_next
Example #19
 def idx_empty(var):
     var_shape = list(var.shape)
     var_shape[0] = 0
     return paddle.empty(var_shape, dtype=var.dtype)
Example #20
 def test_dygraph_api_out(self):
     paddle.disable_static()
     shape = [200, 3]
     out = paddle.empty(shape=shape)
     self.__check_out__(out)
     paddle.enable_static()
Example #21
    def __init__(
        self,
        embed_dim,
        # vision
        image_resolution,
        vision_layers,
        vision_width,
        vision_patch_size,
        # text
        context_length,
        vocab_size,
        transformer_width,
        transformer_heads,
        transformer_layers,
    ):
        super().__init__()
        self.context_length = context_length
        self.embed_dim = embed_dim

        if isinstance(vision_layers, (tuple, list)):
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=image_resolution,
                width=vision_width,
            )
        else:
            vision_heads = vision_width // 64
            self.visual = VisualTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim,
            )

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask(),
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)

        positional_embedding = self.create_parameter(
            shape=(self.context_length, transformer_width),
            default_initializer=Assign(
                paddle.empty((self.context_length, transformer_width))
            ),
        )
        self.add_parameter("positional_embedding", positional_embedding)

        self.ln_final = nn.LayerNorm(transformer_width)

        text_projection = self.create_parameter(
            shape=(transformer_width, embed_dim),
            default_initializer=Assign(paddle.empty((transformer_width, embed_dim))),
        )
        self.add_parameter("text_projection", text_projection)

        logit_scale = self.create_parameter(
            shape=(1,), default_initializer=Assign(paddle.ones([1]))
        )
        self.add_parameter("logit_scale", logit_scale)

        self.initialize_parameters()