Example No. 1
def model(X, M, Y, train=False, reuse=False):
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable("we", [n_vocab+n_special+n_ctx, n_embd], initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])
        M = tf.reshape(M, [-1, n_ctx])

        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d'%layer, train=train, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1])
        lm_losses = tf.reduce_sum(lm_losses*M[:, 1:], 1)/tf.reduce_sum(M[:, 1:], 1)

        clf_h = tf.reshape(h, [-1, n_embd])
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32)*n_ctx+pool_idx)

        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1-clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        clf_logits = clf(clf_h, 1, train=train)
        clf_logits = tf.reshape(clf_logits, [-1, 2])

        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses
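The model above returns per-pair classification losses and per-sequence language-modeling losses. As a rough sketch of how such losses are commonly combined into a single training objective (the `lm_coef` weight and the toy loss values below are illustrative assumptions, not taken from this example):

import numpy as np

# Hypothetical per-example losses, standing in for the tensors returned by model();
# the values and the lm_coef weight are illustrative assumptions only.
clf_losses = np.array([0.7, 0.4, 1.1, 0.3])   # one classification loss per input pair
lm_losses = np.array([3.2, 2.9, 3.5, 3.0])    # masked-mean LM loss per sequence

lm_coef = 0.5  # assumed weight for the auxiliary language-modeling term

# Joint objective: mean classification loss plus a weighted mean LM loss.
train_loss = clf_losses.mean() + lm_coef * lm_losses.mean()
print(train_loss)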
Example No. 2
def conv1d(x,
           scope,
           nf,
           rf,
           w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0),
           pad='VALID',
           train=False):
    """
    for train, x: embed_input [? (len of input seq for curr /gpu:X), 77, 768], scope, nf: (n_state (i.e. last element in
                    x shape_list = 768 (for bpe)) * 3), rf: 1,
    for mlp, nf: (last element in x shape_list) * 4; rf: still = 1
    w: [1, nx (last element in input x), nf (last element in input x * 3)]   # 768*3 for q, k, v
    b: nf = (last element in input x * 3)
    if rf==1: Basically, reshape x and w to multiple 1-D tensors, perform dot-product, add bias and the reshape to
                output format (see next comment)
    output format: list of x shape_list except the last value (i.e [?, 77]), concatenated with nf [768 * 3]
    """
    with tf.variable_scope(scope):
        nx = utils.shape_list(x)[-1]  # last value in x shape_list
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1:  # faster 1x1 conv
            c = tf.reshape(
                tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) +
                b,
                utils.shape_list(x)[:-1] + [nf])
        else:  # was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad) + b
        return c
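Since the docstring is dense, here is a small NumPy sketch (toy shapes, not part of the original code) of why the rf == 1 branch is just a position-wise projection: flattening x to [-1, nx], multiplying by the [nx, nf] weight, and reshaping back matches applying the projection at every position.

import numpy as np

batch, seq, nx, nf = 2, 5, 8, 24           # toy sizes; 24 stands in for nx * 3
x = np.random.randn(batch, seq, nx)
w = np.random.randn(1, nx, nf)             # shape [rf, nx, nf] with rf == 1
b = np.random.randn(nf)

# The rf == 1 fast path: flatten all positions, one matmul, reshape back.
c_fast = (x.reshape(-1, nx) @ w.reshape(nx, nf) + b).reshape(batch, seq, nf)

# The same projection applied position by position.
c_slow = np.stack([x[:, t] @ w[0] + b for t in range(seq)], axis=1)

print(np.allclose(c_fast, c_slow))         # True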
Example No. 3
def model(X, M, train=False, reuse=False):
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable(
            "we", [N_VOCAB + N_CTX, N_EMBD],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, EMBD_PDROP, train)

        X = tf.reshape(X, [-1, N_CTX, 2])
        M = tf.reshape(M, [-1, N_CTX])

        h = embed(X, we)
        for layer in range(N_LAYER):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        lm_h = tf.reshape(h, [-1, N_EMBD])
        lm_logits = tf.reshape(
            tf.matmul(lm_h, we[:N_VOCAB, :], transpose_b=True),
            [-1, N_CTX, N_VOCAB])
        lm_logits_truncated = tf.reshape(lm_logits[:, :-1], [-1, N_VOCAB])
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits_truncated, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses,
            [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)
        return lm_logits, lm_losses
Example No. 4
def blocksparse_attention_impl(q,
                               k,
                               v,
                               heads,
                               attn_mode,
                               local_attn_ctx=None,
                               blocksize=32,
                               num_verts=None,
                               vertsize=None):
    n_ctx = shape_list(q)[1]
    if attn_mode == 'strided':
        # Strided attention is implemented on the transposed matrix to provide greater block sparsity
        q = strided_transpose(q, n_ctx, local_attn_ctx, blocksize)
        k = strided_transpose(k, n_ctx, local_attn_ctx, blocksize)
        v = strided_transpose(v, n_ctx, local_attn_ctx, blocksize)
    n_state = shape_list(q)[-1] // heads
    bst = get_blocksparse_obj(n_ctx, heads, attn_mode, blocksize,
                              local_attn_ctx, num_verts, vertsize)
    scale_amount = tf.cast(1.0 / np.sqrt(n_state), tf.float32)
    w = bst.query_key_op(q, k)
    w = bst.masked_softmax(w, scale=scale_amount)
    a = bst.weight_value_op(w, v)
    if attn_mode == 'strided':
        n, t, embd = shape_list(a)
        bT_ctx = n_ctx // local_attn_ctx
        a = tf.reshape(a, [n, local_attn_ctx, bT_ctx, embd])
        a = tf.transpose(a, [0, 2, 1, 3])
        a = tf.reshape(a, [n, t, embd])
    return a
Example No. 5
    def model(self, X, M, train=False, reuse=False, num_ps=1):
        with tf.variable_scope(
                'model_lm',
                reuse=reuse,
                partitioner=tf.fixed_size_partitioner(num_shards=16)):
            we = tf.get_variable(
                "we", [self.n_vocab + self.n_special + n_ctx, n_embd],
                initializer=tf.random_normal_initializer(stddev=0.02))
            we = dropout(we, embd_pdrop, train)

            X = tf.reshape(X, [-1, n_ctx, 2])
            M = tf.reshape(M, [-1, n_ctx])

            h = embed(X, we)
            for layer in range(n_layer):
                h = block(h, 'h%d' % layer, train=train, scale=True)

            lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
            lm_logits = tf.matmul(lm_h, we, transpose_b=True)
            lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
            lm_losses = tf.reshape(
                lm_losses,
                [shape_list(X)[0], shape_list(X)[1] - 1])
            lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
                M[:, 1:], 1)

            return lm_losses
Example No. 6
def model(X,
          M,
          Y,
          train=False,
          reuse=False):  ## X: [8, 2, 77, 2], M: [8, 2, 77], Y: [8]
    with tf.variable_scope('model', reuse=reuse):
        ### n_vocab: 40478, n_special: 3, n_ctx: 77, n_embed: 768
        we = tf.get_variable("we", [n_vocab + n_special + n_ctx, n_embd],
                             initializer=tf.random_normal_initializer(
                                 stddev=0.02))  ## we: [40558, 768]
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])  ## X: [16, 77, 2]
        M = tf.reshape(M, [-1, n_ctx])  ## M: [16, 77]

        h = embed(
            X,
            we)  ## h: [-1, n_ctx, n_embed] : h0 = UWe + Wp (the Wp term is not visible at this point)
        for layer in range(n_layer):  ## n_layer: 12
            h = block(h, 'h%d' % layer, train=train, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])  ## lm_h: [1216, 768]
        lm_logits = tf.matmul(lm_h, we,
                              transpose_b=True)  ## lm_logits: [1216, 40558]
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(
                X[:, 1:,
                  0], [-1]))  ## lm_loss: [1216], P(u) = softmax(hn * WeT)
        lm_losses = tf.reshape(
            lm_losses,
            [shape_list(X)[0], shape_list(X)[1] - 1])  ## lm_loss: [16, 76]
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)  ## lm_loss: [16]

        clf_h = tf.reshape(h, [-1, n_embd])  ## clf_h: [1232, 768]
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)  ## pool_idx: [16], string length
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * n_ctx + pool_idx
        )  # clf_h: [16, 768], https://www.tensorflow.org/api_docs/python/tf/gather, https://www.tensorflow.org/api_docs/python/tf/range

        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])  ## clf_h: [8, 2, 768]
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])  ## clf_h: [16, 768]
        clf_logits = clf(clf_h, 1,
                         train=train)  ## clf_logits: [16, 1], computes hm * WyT
        clf_logits = tf.reshape(clf_logits, [-1, 2])  ## clf_logits: [8, 2]

        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)  ## P(y) = softmax(hm * WyT)
        return clf_logits, clf_losses, lm_losses
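The pool_idx/tf.gather step above selects the hidden state at the position of the classifier token. A small NumPy sketch of the same indexing (made-up token ids and shapes, with a hypothetical clf_token value) may make the mechanics clearer:

import numpy as np

n_ctx, n_embd, clf_token = 6, 4, 99        # toy sizes; 99 is a hypothetical special-token id
tokens = np.array([[5, 7, 99, 0, 0, 0],    # clf token at position 2
                   [3, 4, 8, 9, 99, 0]])   # clf token at position 4
h = np.random.randn(2, n_ctx, n_embd)      # stand-in for the transformer output

# argmax over the boolean match finds the first occurrence of clf_token in each row.
pool_idx = (tokens == clf_token).argmax(axis=1)                 # -> [2, 4]

# Flatten to [batch * n_ctx, n_embd] and gather row b * n_ctx + pool_idx[b].
clf_h = h.reshape(-1, n_embd)[np.arange(2) * n_ctx + pool_idx]

print(np.allclose(clf_h, h[np.arange(2), pool_idx]))            # True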
Example No. 7
def conv1d(x, scope, nf, rf, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), pad='VALID', train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1:  # faster 1x1 conv
            c = tf.reshape(tf.matmul(tf.reshape(
                x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, shape_list(x)[:-1]+[nf])
        else:  # was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad)+b
        return c
Example No. 8
def model_pw(X, M, Y, train=False, reuse=False, ordinal=False):
    """
        X: [batch, n_ctx, 2]
        M: [batch, n_ctx]
        Y: [batch]
    """
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable(
            "we", [n_vocab + n_special + n_ctx, n_embd],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        #transformer blocks
        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d' % layer, train=train, scale=True)

        #language modeling objective
        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses,
            [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)

        clf_h = tf.reshape(h, [-1, n_embd])
        #get length of each example
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)
        #takes the transformer state at the position of the classifier token (the end of the input)
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * n_ctx + pool_idx)

        #reshape to [batch, 1, embed size]
        clf_h = tf.reshape(clf_h, [-1, 1, n_embd])
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        #put tensor back into (batch, embed size) shape
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        #linear layer
        clf_logits = clf_pw(clf_h, train=train, ordinal=ordinal)

        #final softmax
        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)

        return clf_logits, clf_losses, lm_losses
Example No. 9
def model(X, M, Y, Is_train, data_params):

    n_vocab = data_params['n_vocab']
    n_special = data_params['n_special']
    max_word = data_params['max_word']
    clf_token = data_params['clf_token']
    clf_pdrop = 0.1
    with tf.variable_scope('transformer', reuse=False):
        we = tf.get_variable(
            "we", [n_vocab + n_special + max_word, 768],
            initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, 0.1, True)

        X = tf.reshape(
            X, [-1, max_word, 2])  # (batch * 1sent, 161, 2) == (8,161,2)
        M = tf.reshape(M, [-1, max_word])

        h = embed(X, we)  # (8, 161, 768)

        for layer in range(12):
            h = block(h, 'h%d' % layer, train=True, scale=True)

        lm_h = tf.reshape(h[:, :-1], [-1, 768])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(
            lm_losses,
            [shape_list(X)[0], shape_list(X)[1] - 1])
        lm_losses = tf.reduce_sum(lm_losses * M[:, 1:], 1) / tf.reduce_sum(
            M[:, 1:], 1)

        clf_h = tf.reshape(h, [-1, 768])  # h:(8,161,768)  clf_h:(1288, 768)
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)  # last: clf_token
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * max_word +
            pool_idx)  # (8,768)
        clf_h = tf.reshape(clf_h, [-1, 1, 768])  # (8, 1, 768)
        if True and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1 - clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, 768])  # 8*1*768
        clf_logits = clf(clf_h, 2, train=True)  # 1 sent --> 2 classes
        clf_logits = tf.reshape(clf_logits, [-1, 2])
        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=clf_logits, labels=Y)
        return clf_logits, tf.reduce_mean(clf_losses), tf.reduce_mean(
            lm_losses)
Example No. 10
def conv1d(x,
           scope,
           nf,
           w_init=tf.random_normal_initializer(stddev=0.02),
           b_init=tf.constant_initializer(0),
           pad='VALID',
           train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), w) + b,
            shape_list(x)[:-1] + [nf])
        return c
Example No. 11
def norm(x, scope, axis=[-1]):
    with tf.variable_scope(scope):
        n_state = shape_list(x)[-1]
        g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
        g, b = get_ema_vars(g, b)
        return _norm(x, g, b, axis=axis)
Example No. 12
 def mask_attn_weights(self, w):
     # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
     _, _, nd, ns = utils.shape_list(w)
     b = utils.attention_mask(nd, ns, dtype=w.dtype)
     b = torch.reshape(b, [1, 1, nd, ns])
     w = w * b - torch.Tensor([1e10]).to(w.dtype) * (1 - b)
     return w
Example No. 13
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf])
        return c
Example No. 14
 def mask_attn_weights(w):
     # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
     _, _, nd, ns = shape_list(w)
     b = attention_mask(nd, ns, dtype=w.dtype)
     b = tf.reshape(b, [1, 1, nd, ns])
     w = w*b - tf.cast(1e10, w.dtype)*(1-b)
     return w
Example No. 15
    def dec_block(self,
                  x,
                  scope,
                  train=False,
                  scale=False,
                  encoder_output=None):
        with tf.variable_scope(
                scope
        ):  # scope: h%d    (initialized with the loaded params_d.npy values)
            nx = utils.shape_list(x)[-1]
            a = self.attn(x,
                          'attn',
                          nx,
                          self.params.n_head,
                          train=train,
                          scale=scale)
            n = norm(x + a, 'ln_1')

            if encoder_output is not None:  # encoder-decoder attn performed
                a = self.attn(n,
                              'attn_enc_dec',
                              nx,
                              self.params.n_head,
                              train=train,
                              scale=scale,
                              encoder_output=encoder_output,
                              use_mask_attn=False)
                n = norm(n + a, 'ln_enc_dec')

            m = self.mlp(n, 'mlp', nx * 4, train=train)
            h = norm(n + m, 'ln_2')
            return h
Example No. 16
def attention_impl(q, k, v, heads, attn_mode, local_attn_ctx=None):
    q = split_heads(q, heads)
    k = split_heads(k, heads)
    v = split_heads(v, heads)
    n_timesteps = shape_list(k)[2]
    mask = tf.to_float(get_attn_mask(n_timesteps, attn_mode, local_attn_ctx))
    w = tf.matmul(q, k, transpose_b=True)
    scale_amount = 1.0 / np.sqrt(shape_list(q)[-1])
    w = tf.cast(w, tf.float32)
    w = w * scale_amount
    w = w * mask + -1e9 * (1 - mask)
    w = tf.nn.softmax(w)
    w = tf.cast(w, tf.float16)
    a = tf.matmul(w, v)
    a = merge_heads(a)
    return a
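For orientation, a short NumPy sketch (toy shapes, no float16 casts, assuming a plain causal mask) of the masked, scaled dot-product attention that attention_impl computes per head:

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

heads_x_batch, t, d = 2, 5, 4
q = np.random.randn(heads_x_batch, t, d)
k = np.random.randn(heads_x_batch, t, d)
v = np.random.randn(heads_x_batch, t, d)

mask = np.tril(np.ones((t, t)))                        # assumed causal mask
w = (q @ k.transpose(0, 2, 1)) / np.sqrt(d)            # scaled dot-product scores
w = w * mask + -1e9 * (1 - mask)                       # block attention to future positions
a = softmax(w) @ v                                     # attention-weighted sum of values
print(a.shape)                                         # (2, 5, 4)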
Example No. 17
def merge_states(x):
    """
    reshape (batch, pixel, head, head_state) -> (batch, pixel, state)
    """
    x_shape = shape_list(x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return tf.reshape(x, new_x_shape)
Example No. 18
 def call(self, inputs, **kwargs):
     if self.rf == 1:
         c = tf.reshape(tf.matmul(tf.reshape(inputs, [-1, self.nx]), tf.reshape(self.w, [-1, self.nf])) + self.b,
                        shape_list(inputs)[:-1] + [self.nf])
     else:
         c = tf.nn.conv1d(value=inputs, filters=self.w, stride=1, padding='VALID') + self.b
     return c
Example No. 19
    def forward(self, X, past):
        results = {}
        batch, sequence = utils.shape_list(X)

        wpe = Variable(torch.randn([self.n_ctx, self.n_embd]))  # position embeddings ('wpe')
        wte = Variable(torch.randn([self.n_vocab, self.n_embd]))  # token embeddings ('wte')

        past_length = 0 if past is None else past.shape[-2]
        h = wte[X] + wpe[self.positions_for(X, past_length)]

        # Transformer
        presents = []
        pasts = torch.unbind(
            past, dim=1) if past is not None else [None] * self.n_layer
        assert len(pasts) == self.n_layer

        for layer, past in enumerate(pasts):
            h, present = self.block(h, past=past)
            presents.append(present)
        results['present'] = torch.stack(presents, dim=1)
        h = self.norm(h)

        # Language model loss.  Do tokens <n predict token n?
        h_flat = torch.reshape(h, [batch * sequence, self.n_embd])
        logits = torch.matmul(h_flat, wte.t())
        logits = torch.reshape(logits, [batch, sequence, self.n_vocab])
        results['logits'] = logits
        return results
Example No. 20
def block(x, scope, train=False, scale=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        a = attn(x, 'attn', nx, n_head, train=train, scale=scale)
        n = norm(x+a, 'ln_1')
        m = mlp(n, 'mlp', nx*4, train=train)
        h = norm(n+m, 'ln_2')
        return h
Example No. 21
def mlp(x, scope, n_state, train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_fns[afn]
        h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj', nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train)
        return h2
Example No. 22
def mask_attn_weights(w):
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(
        tf.ones([n, n]), -1,
        0)  ## https://www.tensorflow.org/api_docs/python/tf/linalg/band_part
    b = tf.reshape(b, [1, 1, n, n])
    w = w * b + -1e9 * (1 - b)
    return w
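As a quick illustration (not from the original code), the mask built by tf.matrix_band_part(tf.ones([n, n]), -1, 0) is simply a lower-triangular matrix, which drives attention scores to future positions toward minus infinity:

import numpy as np

n = 4
b = np.tril(np.ones((n, n)))        # same lower-triangular mask as matrix_band_part(ones, -1, 0)
w = np.random.randn(1, 1, n, n)     # toy attention scores [batch, heads, dst, src]

masked = w * b + -1e9 * (1 - b)     # positions above the diagonal get a huge negative score

print(b)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]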
Example No. 23
def block(x, scope, train=False, scale=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        a = attn(x, "attn", nx, n_head, train=train, scale=scale)
        n = norm(x + a, "ln_1")
        m = mlp(n, "mlp", nx * 4, train=train)
        h = norm(n + m, "ln_2")
        return h
Example No. 24
def split_states(x, n):
    """
    reshape (batch, pixel, state) -> (batch, pixel, head, head_state)
    """
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    return tf.reshape(x, new_x_shape)
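A quick NumPy shape check (toy sizes, not part of the original code) showing that split_states and the merge_states from Example No. 17 are inverses of each other:

import numpy as np

batch, pixel, heads, head_state = 2, 5, 4, 3
x = np.random.randn(batch, pixel, heads * head_state)        # (batch, pixel, state)

split = x.reshape(batch, pixel, heads, head_state)           # what split_states(x, heads) produces
merged = split.reshape(batch, pixel, heads * head_state)     # what merge_states(split) produces

print(np.array_equal(x, merged))                             # True: the two reshapes are inverses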
Example No. 25
def strided_transpose(x, n_ctx, local_attn_ctx, blocksize):
    bT_ctx = n_ctx // local_attn_ctx
    assert bT_ctx % blocksize == 0, f'{bT_ctx}, {blocksize}'
    n, t, embd = shape_list(x)
    x = tf.reshape(x, [n, bT_ctx, local_attn_ctx, embd])
    x = tf.transpose(x, [0, 2, 1, 3])
    x = tf.reshape(x, [n, t, embd])
    return x
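A small NumPy sketch (toy sizes, assumed values) showing that the reshape/transpose applied after the attention op in Example No. 4 undoes strided_transpose:

import numpy as np

n, n_ctx, embd, local_attn_ctx = 2, 8, 3, 4
bT_ctx = n_ctx // local_attn_ctx
x = np.random.randn(n, n_ctx, embd)

# strided_transpose: group positions that are local_attn_ctx apart.
s = x.reshape(n, bT_ctx, local_attn_ctx, embd).transpose(0, 2, 1, 3).reshape(n, n_ctx, embd)

# Inverse reshape/transpose, as applied at the end of blocksparse_attention_impl in Example No. 4.
back = s.reshape(n, local_attn_ctx, bT_ctx, embd).transpose(0, 2, 1, 3).reshape(n, n_ctx, embd)

print(np.array_equal(x, back))      # True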
Example No. 26
 def multinomial_squeeze(self, logits, temperature=1.0):
     """multinomial sampling from logits."""
     logits_shape = utils.shape_list(logits)
     reshaped_logits = (tf.reshape(logits, [-1, logits_shape[-1]]) /
                        temperature)
     choices = tf.multinomial(reshaped_logits, 1)
     choices = tf.reshape(choices, logits_shape[:-1])
     return tf.to_int32(choices)
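A rough NumPy sketch (made-up logits, tiny vocabulary) of what the temperature-scaled multinomial sampling above does for a single position:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])     # hypothetical logits over a three-token vocabulary
temperature = 0.7

probs = softmax(logits / temperature)  # lower temperature sharpens the distribution
choice = np.random.choice(len(logits), p=probs)
print(choice)                          # sampled token id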
Example No. 27
 def mlp(self, x, scope, n_state, train=False):
     with tf.variable_scope(scope):
         nx = utils.shape_list(x)[-1]  # nx: 768 for bpe
         act = ACT_FNS[self.params.afn]  # gelu
         h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
         h2 = conv1d(h, 'c_proj', nx, 1, train=train)
         h2 = dropout(h2, self.params.resid_pdrop, train)
         return h2
Example No. 28
 def _attn(self, q, k, v):
     w = tf.matmul(q, k)
     if self.scale:
         n_state = shape_list(v)[-1]
         w = w * tf.rsqrt(tf.cast(n_state, tf.float32))
     w = self.mask_attn_weights(w)
     w = tf.nn.softmax(w)
     a = tf.matmul(w, v)
     return a
Example No. 29
def split_states(x, n):
    """
    from [batch, n_ctx, n_embd]
    to [batch, n_ctx, n_head, n_embd//n_head]
    """
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m//n]
    return tf.reshape(x, new_x_shape)
Example No. 30
def clf(x,
        ny,
        w_init=tf.random_normal_initializer(stddev=0.02),
        b_init=tf.constant_initializer(0),
        train=False):
    with tf.variable_scope('clf'):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, ny], initializer=w_init)
        b = tf.get_variable("b", [ny], initializer=b_init)
        return tf.matmul(x, w) + b