def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training embedding layer...')

    # Use fake train data
    args = get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
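The init_distributed() and print_rank_0() helpers used above are not shown in this listing. A minimal sketch of what they might look like, assuming the benchmark is launched with torchrun (which sets RANK, WORLD_SIZE, and LOCAL_RANK) and uses the NCCL backend, is below; the exact backend choice and environment handling in AutoMP may differ.

import os
import torch

def init_distributed():
    # Sketch: create the default process group from the environment variables
    # that torchrun / torch.distributed.launch provide (RANK, WORLD_SIZE, ...).
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    # Bind this process to its local GPU so torch.cuda.current_device() is correct.
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    torch.cuda.set_device(local_rank)

def print_rank_0(message):
    # Sketch: print only on rank 0 to avoid one line of output per process.
    if torch.distributed.get_rank() == 0:
        print(message)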
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
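To make the masking convention concrete: gpt2_attention_mask_func fills masked positions with -10000.0 so that softmax drives their attention weights to (near) zero. The standalone sketch below builds a generic left-to-right (causal) mask with torch.triu; it illustrates the same trick but is not the get_ltor_masks_and_position_ids helper itself.

import torch

seq_len = 4
# Boolean mask that is True strictly above the diagonal, i.e. at the future
# positions a token is not allowed to attend to.
ltor_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

attention_scores = torch.zeros(seq_len, seq_len)
# Same masking trick as gpt2_attention_mask_func: masked positions get a large
# negative value so softmax assigns them (near-)zero probability.
attention_scores.masked_fill_(ltor_mask, -10000.0)
print(torch.softmax(attention_scores, dim=-1))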
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')

    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')
    self_att_output = self_attention.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
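The ALBERT_DEBUG prints above are checking tensor shapes around the attention-score computation. As a rough, standalone shape walkthrough (not the AutoMP implementation; under tensor model parallelism, num_heads here would be the per-partition head count), splitting [batch, seq, hidden] activations into heads looks like this:

import torch

batch_size, seq_len, hidden_size, num_heads = 2, 8, 16, 4
head_dim = hidden_size // num_heads

hidden_states = torch.randn(batch_size, seq_len, hidden_size)
# [b, s, h] -> [b, s, np, hn] -> [b, np, s, hn]
per_head = hidden_states.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
# Raw attention scores have shape [b, np, s, s], which is the shape the
# ltor_mask must broadcast against inside gpt2_attention_mask_func.
scores = torch.matmul(per_head, per_head.transpose(-1, -2)) / (head_dim ** 0.5)
print(scores.size())  # torch.Size([2, 4, 8, 8])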
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    args = get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    attention_mask = (torch.randint(
        low=0, high=2,
        size=(sequence_length,
              divide(num_attention_heads, torch.distributed.get_world_size()),
              batch_size, batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        input_ = torch.rand(size=embedding_output.size()).cuda()

        overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(overall_name)

        # Forward pass
        fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(fname)
        layer_output = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(layer_output)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)

        # Backward pass
        bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
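The Profiler and divide helpers used in these benchmarks are not shown. The sketch below is one plausible, simplified version: divide follows the Megatron-style "assert divisibility, then integer-divide" convention used when splitting attention heads across tensor-parallel ranks, and Profiler is assumed to be a named wall-clock timer that appends elapsed seconds to one file per label under the benchmark directory; the actual AutoMP implementations may differ.

import os
import time
import torch

def divide(numerator, denominator):
    # Hypothetical helper: even split with an explicit divisibility check.
    assert numerator % denominator == 0, f'{numerator} is not divisible by {denominator}'
    return numerator // denominator

class Profiler:
    # Hypothetical sketch of a named start/stop timer that logs elapsed
    # seconds, one file per label, under `log_dir`.
    def __init__(self, log_dir):
        os.makedirs(log_dir, exist_ok=True)
        self.log_dir = log_dir
        self._starts = {}

    def start(self, name):
        torch.cuda.synchronize()  # make sure pending GPU work is not attributed to this region
        self._starts[name] = time.perf_counter()

    def stop(self, name):
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - self._starts.pop(name)
        with open(os.path.join(self.log_dir, f'{name}.txt'), 'a') as f:
            f.write(f'{elapsed:.6f}\n')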