def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training embedding layer...')

    # Use fake train data
    args = get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
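The init_distributed() and print_rank_0() helpers used above are not shown in this listing. A minimal sketch of what they might look like, assuming the benchmark is launched with torchrun (which sets RANK, WORLD_SIZE, and LOCAL_RANK) and uses the NCCL backend, is below; the exact backend choice and environment handling in AutoMP may differ.

import os
import torch

def init_distributed():
    # Sketch: create the default process group from the environment variables
    # that torchrun / torch.distributed.launch provide (RANK, WORLD_SIZE, ...).
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    # Bind this process to its local GPU so torch.cuda.current_device() is correct.
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    torch.cuda.set_device(local_rank)

def print_rank_0(message):
    # Sketch: print only on rank 0 to avoid one line of output per process.
    if torch.distributed.get_rank() == 0:
        print(message)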
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
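To make the masking convention concrete: gpt2_attention_mask_func fills masked positions with -10000.0 so that softmax drives their attention weights to (near) zero. The standalone sketch below builds a generic left-to-right (causal) mask with torch.triu; it illustrates the same trick but is not the get_ltor_masks_and_position_ids helper itself.

import torch

seq_len = 4
# Boolean mask that is True strictly above the diagonal, i.e. at the future
# positions a token is not allowed to attend to.
ltor_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

attention_scores = torch.zeros(seq_len, seq_len)
# Same masking trick as gpt2_attention_mask_func: masked positions get a large
# negative value so softmax assigns them (near-)zero probability.
attention_scores.masked_fill_(ltor_mask, -10000.0)
print(torch.softmax(attention_scores, dim=-1))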
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')

    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')
    self_att_output = self_attention.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
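The ALBERT_DEBUG prints above are checking tensor shapes around the attention-score computation. As a rough, standalone shape walkthrough (not the AutoMP implementation; under tensor model parallelism, num_heads here would be the per-partition head count), splitting [batch, seq, hidden] activations into heads looks like this:

import torch

batch_size, seq_len, hidden_size, num_heads = 2, 8, 16, 4
head_dim = hidden_size // num_heads

hidden_states = torch.randn(batch_size, seq_len, hidden_size)
# [b, s, h] -> [b, s, np, hn] -> [b, np, s, hn]
per_head = hidden_states.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
# Raw attention scores have shape [b, np, s, s], which is the shape the
# ltor_mask must broadcast against inside gpt2_attention_mask_func.
scores = torch.matmul(per_head, per_head.transpose(-1, -2)) / (head_dim ** 0.5)
print(scores.size())  # torch.Size([2, 4, 8, 8])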
def train():
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    args = get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    attention_mask = (torch.randint(
        low=0, high=2,
        size=(sequence_length,
              divide(num_attention_heads, torch.distributed.get_world_size()),
              batch_size, batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        input_ = torch.rand(size=embedding_output.size()).cuda()

        overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(overall_name)

        # Forward pass
        fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(fname)
        layer_output = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(layer_output)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)

        # Backward pass
        bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
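The Profiler and divide helpers used in these benchmarks are not shown. The sketch below is one plausible, simplified version: divide follows the Megatron-style "assert divisibility, then integer-divide" convention used when splitting attention heads across tensor-parallel ranks, and Profiler is assumed to be a named wall-clock timer that appends elapsed seconds to one file per label under the benchmark directory; the actual AutoMP implementations may differ.

import os
import time
import torch

def divide(numerator, denominator):
    # Hypothetical helper: even split with an explicit divisibility check.
    assert numerator % denominator == 0, f'{numerator} is not divisible by {denominator}'
    return numerator // denominator

class Profiler:
    # Hypothetical sketch of a named start/stop timer that logs elapsed
    # seconds, one file per label, under `log_dir`.
    def __init__(self, log_dir):
        os.makedirs(log_dir, exist_ok=True)
        self.log_dir = log_dir
        self._starts = {}

    def start(self, name):
        torch.cuda.synchronize()  # make sure pending GPU work is not attributed to this region
        self._starts[name] = time.perf_counter()

    def stop(self, name):
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - self._starts.pop(name)
        with open(os.path.join(self.log_dir, f'{name}.txt'), 'a') as f:
            f.write(f'{elapsed:.6f}\n')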