Example No. 1
    def __call__(self):
        with distribute.data_placement_scope():
            x = flow.data.megatron_gpt_mmap_data_loader(
                data_file_prefix=self.dataset,
                seq_length=self.seq_length,
                num_samples=self.num_samples,
                batch_size=self.batch_size,
                dtype=flow.int64,
                shuffle=True,
                random_seed=self.seed,
                split_sizes=self.split,
                split_index=0,
                nd_sbp=distribute.get_data_parallel_dist(),
                name=self.name,
            )

        # the embedding is placed on the first pipeline stage
        with distribute.layer_placement_scope(0):
            data = flow.slice(x, begin=(None, 0), size=(None, self.seq_length))

        # the loss is placed on the last pipeline stage
        with distribute.layer_placement_scope(-1):
            labels = flow.slice(x,
                                begin=(None, 1),
                                size=(None, self.seq_length))

        return data, labels
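A note on the slicing above: the loader yields token blocks of shape (batch_size, seq_length + 1), and the two flow.slice calls take the first seq_length columns as model inputs and the last seq_length columns as next-token labels. A minimal NumPy sketch of the same offset-by-one split (the names and sizes below are illustrative assumptions, not part of the original code):

import numpy as np

batch_size, seq_length = 2, 8
x = np.arange(batch_size * (seq_length + 1)).reshape(batch_size, seq_length + 1)

data = x[:, 0:seq_length]        # tokens fed to the model
labels = x[:, 1:seq_length + 1]  # the same tokens shifted left by one position

assert data.shape == labels.shape == (batch_size, seq_length)
assert np.array_equal(data[:, 1:], labels[:, :-1])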
Example No. 2
def SQuAD(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
):

    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )

    with flow.scope.namespace("cls-squad"):
        final_hidden = backbone.sequence_output()
        final_hidden_matrix = flow.reshape(final_hidden, [-1, hidden_size])
        logits = bert_util._FullyConnected(
            final_hidden_matrix,
            hidden_size,
            units=2,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name='output')
        logits = flow.reshape(logits, [-1, seq_length, 2])

        start_logits = flow.slice(logits, [None, None, 0], [None, None, 1])
        end_logits = flow.slice(logits, [None, None, 1], [None, None, 1])

    return start_logits, end_logits
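The head above projects every token position to two logits and splits the last dimension into span-start and span-end scores. A small NumPy sketch of that final split (shapes below are illustrative assumptions):

import numpy as np

batch_size, seq_length = 2, 4
logits = np.random.rand(batch_size, seq_length, 2)

start_logits = logits[:, :, 0:1]  # matches flow.slice(logits, [None, None, 0], [None, None, 1])
end_logits = logits[:, :, 1:2]    # matches flow.slice(logits, [None, None, 1], [None, None, 1])

assert start_logits.shape == end_logits.shape == (batch_size, seq_length, 1)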
Example No. 3
    def query_key_value(self, h):
        """
        Split the input into q, k, v and split the hidden states into heads,
            shape: (batch_size, seq_length, hidden_size)
                -> (batch_size, seq_length, num_attn_heads, head_size)
                -> (batch_size, num_attn_heads, seq_length, head_size)
        """
        assert len(h.shape) == 3

        # Note: the factor of 3 sits between num_heads and head_size so that
        # the q, k, v features of each head are laid out contiguously.
        new_shape = (
            h.shape[0],
            h.shape[1],
            self.num_heads,
            3 * self.head_size,
        )
        if h.shape[0] == self.seq_length and h.shape[1] == self.batch_size:
            perm = [1, 2, 0, 3]
        elif h.shape[0] == self.batch_size and h.shape[1] == self.seq_length:
            perm = [0, 2, 1, 3]
        else:
            raise ValueError(f"unexpected input shape: {h.shape}")

        h = flow.reshape(h, new_shape)
        q, k, v = (flow.transpose(
            flow.slice(
                h,
                begin=[None, None, None, i * self.head_size],
                size=[None, None, None, self.head_size],
            ),
            perm=perm,
        ) for i in range(3))
        return q, k, v
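To see why the reshape keeps the factor of 3 next to head_size, here is a NumPy sketch of the same split for the (batch_size, seq_length, hidden_size) layout, assuming hidden_size == num_heads * 3 * head_size (all names and sizes below are illustrative):

import numpy as np

batch_size, seq_length, num_heads, head_size = 2, 4, 3, 5
h = np.random.rand(batch_size, seq_length, num_heads * 3 * head_size)

h = h.reshape(batch_size, seq_length, num_heads, 3 * head_size)
q, k, v = (
    h[:, :, :, i * head_size:(i + 1) * head_size].transpose(0, 2, 1, 3)
    for i in range(3)
)

# (batch_size, num_attn_heads, seq_length, head_size), as in the docstring
assert q.shape == k.shape == v.shape == (batch_size, num_heads, seq_length, head_size)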
Example No. 4
        def train(x: flow.typing.Numpy.Placeholder(
            (args.global_batch_size, args.seq_length + 1), dtype=flow.int64)):
            x = distribute.input_data_parallel_cast(x)
            with distribute.layer_placement_scope(0):
                data = flow.slice(x,
                                  begin=(None, 0),
                                  size=(None, args.seq_length))
            with distribute.layer_placement_scope(-1):
                labels = flow.slice(x,
                                    begin=(None, 1),
                                    size=(None, args.seq_length))

            logits = model(data)
            losses = loss(logits, labels)
            optimizer.minimize(losses)

            losses = distribute.output_parallel_cast(losses)
            return {"loss": losses}
Example No. 5
def PooledOutput(sequence_output, hidden_size, initializer_range):
    with flow.scope.namespace("bert-pooler"):
        first_token_tensor = flow.slice(sequence_output, [None, 0, 0],
                                        [None, 1, -1])
        first_token_tensor = flow.reshape(first_token_tensor,
                                          [-1, hidden_size])
        pooled_output = bert_util._FullyConnected(
            first_token_tensor,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        pooled_output = flow.math.tanh(pooled_output)
    return pooled_output
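PooledOutput keeps only the hidden state of the first token (the [CLS] position) before the dense + tanh projection. The slice-and-reshape step corresponds to this NumPy sketch (shapes below are assumed for illustration):

import numpy as np

batch_size, seq_length, hidden_size = 2, 4, 8
sequence_output = np.random.rand(batch_size, seq_length, hidden_size)

# flow.slice(sequence_output, [None, 0, 0], [None, 1, -1]) keeps position 0 of every sequence
first_token_tensor = sequence_output[:, 0:1, :]
first_token_tensor = first_token_tensor.reshape(-1, hidden_size)

assert first_token_tensor.shape == (batch_size, hidden_size)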
Example No. 6
File: bert.py Project: zzk0/oneflow
def _EmbeddingPostprocessor(
    input_blob,
    seq_length,
    embedding_size,
    use_token_type=False,
    token_type_ids_blob=None,
    token_type_vocab_size=16,
    token_type_embedding_name="token_type_embeddings",
    use_position_embeddings=True,
    position_embedding_name="position_embeddings",
    initializer_range=0.02,
    max_position_embeddings=512,
    dropout_prob=0.1,
):
    output = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        token_type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        token_type_embeddings = flow.gather(params=token_type_table,
                                            indices=token_type_ids_blob,
                                            axis=0)
        output = output + token_type_embeddings

    if use_position_embeddings:
        position_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            position_table = flow.slice(position_table,
                                        begin=[None, 0, 0],
                                        size=[None, seq_length, -1])
        output = output + position_table

    output = _LayerNorm(output, embedding_size)
    output = _Dropout(output, dropout_prob)

    return output
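In _EmbeddingPostprocessor the position table is created once at max_position_embeddings and only the first seq_length rows are added, relying on broadcasting over the batch dimension. A NumPy sketch of that slice-and-broadcast (names and sizes below are illustrative assumptions):

import numpy as np

batch_size, seq_length, max_position_embeddings, embedding_size = 2, 4, 512, 8
output = np.random.rand(batch_size, seq_length, embedding_size)
position_table = np.random.rand(1, max_position_embeddings, embedding_size)

# flow.slice(position_table, begin=[None, 0, 0], size=[None, seq_length, -1])
position_table = position_table[:, 0:seq_length, :]

# (1, seq_length, embedding_size) broadcasts against (batch_size, seq_length, embedding_size)
output = output + position_table
assert output.shape == (batch_size, seq_length, embedding_size)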
Example No. 7
def self_attn_qk_v_fw_bw(
    h: flow.typing.Numpy.Placeholder(
        shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
    )
) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
    var = flow.get_variable(
        "var",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.constant_initializer(1.0, dtype=flow.float32),
        trainable=True,
    )
    h = h * var
    if fused:
        flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
    else:
        flow.watch_diff(h, test_global_storage.Setter("h_grad"))
    if fp16:
        h = flow.amp_white_identity(h)
    alpha = get_alpha(head_size)
    if fused:
        # fused kernel: computes the scaled q * k^T product and extracts v in one op
        (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=head_size, alpha=alpha
        )
    else:
        # reference path: slice q, k, v out of the packed hidden states and
        # compute the scaled q * k^T product explicitly
        h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
        (q, k, v) = (
            flow.transpose(
                flow.slice(
                    h,
                    begin=[None, None, None, head_size * i],
                    size=[None, None, None, head_size],
                ),
                perm=[1, 2, 0, 3],
            )
            for i in range(3)
        )
        qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)
    h = flow.matmul(qmk, v)
    loss = flow.math.reduce_sum(h)
    flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
    return (qmk, v)
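The unfused branch above is the reference computation for the fused op: both paths are expected to yield the same scaled q*k^T product and the same v. A NumPy sketch of the reference math (the sizes and the 1/sqrt(head_size) scaling below are assumptions for illustration):

import numpy as np

seq_len, batch_size, num_heads, head_size = 4, 2, 3, 5
alpha = 1.0 / np.sqrt(head_size)
h = np.random.rand(seq_len, batch_size, num_heads, 3 * head_size)

q, k, v = (
    h[:, :, :, i * head_size:(i + 1) * head_size].transpose(1, 2, 0, 3)
    for i in range(3)
)

# scaled attention scores: (batch_size, num_heads, seq_len, seq_len)
qmk = np.matmul(q, k.transpose(0, 1, 3, 2)) * alpha
context = np.matmul(qmk, v)
assert qmk.shape == (batch_size, num_heads, seq_len, seq_len)
assert context.shape == (batch_size, num_heads, seq_len, head_size)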