config.hidden_size = 768
config.intermediate_size = 3072
config.max_position_embeddings = 512
config.vocab_size = 32000

logger.info("USE_NSP: {}".format(USE_NSP))
if USE_NSP:
    model = BertForPreTraining(config)
else:
    model = BertForPreTrainingWithoutNSP(config)
model.to(device)
logger.info(config)
logger.info(model)

optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
train_losses = []
for i in range(1, MAX_STEPS + 1):
    optimizer.zero_grad()
    sent_pairs = create_sent_pairs(sents_list, batch_size=BATCH_SIZE)
    encoded = encode_sent_pairs(sent_pairs)
    res = model(
        encoded["input_ids"].to(device),
        token_type_ids=None,
        attention_mask=encoded["attention_mask"].to(device),
        labels=encoded["labels"].to(device),
        next_sentence_label=encoded["next_sentence_label"].to(device),
    )
    loss = res.loss
    if i % 100 == 0:
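# NOTE: create_sent_pairs() and encode_sent_pairs() are called above but not shown.
# The sketch below is only an assumption of what they might do: build 50% true /
# 50% random next-sentence pairs, tokenize them with a (hypothetical) `tokenizer`,
# and apply simple 15% MLM masking. Names and details are illustrative, not the
# original helpers.
import random
import torch

def create_sent_pairs(sents_list, batch_size):
    pairs = []
    for _ in range(batch_size):
        i = random.randrange(len(sents_list) - 1)
        if random.random() < 0.5:
            pairs.append((sents_list[i], sents_list[i + 1], 0))  # 0 = B follows A
        else:
            j = random.randrange(len(sents_list))
            pairs.append((sents_list[i], sents_list[j], 1))      # 1 = B is random
    return pairs

def encode_sent_pairs(pairs, mlm_probability=0.15):
    texts_a = [a for a, _, _ in pairs]
    texts_b = [b for _, b, _ in pairs]
    enc = tokenizer(texts_a, texts_b, padding=True, truncation=True,
                    max_length=512, return_tensors="pt")
    labels = enc["input_ids"].clone()
    # mask ~15% of non-special tokens for the MLM objective
    probs = torch.full(labels.shape, mlm_probability)
    special = torch.tensor(
        [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
         for ids in labels.tolist()], dtype=torch.bool)
    probs.masked_fill_(special, 0.0)
    masked = torch.bernoulli(probs).bool()
    labels[~masked] = -100
    enc["input_ids"][masked] = tokenizer.mask_token_id
    enc["labels"] = labels
    enc["next_sentence_label"] = torch.tensor([lbl for _, _, lbl in pairs])
    return enc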
config = BertConfig.from_json_file('bert_config.json')

# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
    config.vocab_size += 8 - (config.vocab_size % 8)
vocab_size = config.vocab_size

#tokenizer = BertTokenizer.from_pretrained(pretrained_path)
#model = BertForPreTraining.from_pretrained(pretrained_path)
model = BertForPreTraining(config)
if args.cuda:
    model.cuda()

optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8)  # args.adam_epsilon - default is 1e-8.
#optimizer = optim.SGD(model.parameters(), lr=2e-5)

compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression,
                                     op=hvd.Average)

# Horovod: broadcast parameters & optimizer state from rank 0 so all workers start identically.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
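# NOTE: the Horovod setup that normally precedes the snippet above is not shown.
# This is a minimal sketch of the usual boilerplate (assumed, not taken from the
# original script; `train_dataset` is a placeholder name): initialise Horovod,
# pin each process to its local GPU, and shard the data across workers.
import torch
import horovod.torch as hvd

hvd.init()
if args.cuda:
    torch.cuda.set_device(hvd.local_rank())

train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler)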
sop_metric(logits=seq_relationship_score.view(-1, 2), target=is_next.view(-1))

if Config.gradient_accumulation_steps > 1:
    loss = loss / Config.gradient_accumulation_steps
loss.backward()

nb_tr_steps += 1
tr_mask_acc.update(mask_metric.value(), n=input_ids.size(0))
tr_sop_acc.update(sop_metric.value(), n=input_ids.size(0))
tr_loss.update(loss.item(), n=1)
tr_mask_loss.update(masked_lm_loss.item(), n=1)
tr_sop_loss.update(next_sentence_loss.item(), n=1)

if (step + 1) % Config.gradient_accumulation_steps == 0:
    torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)
    optimizer.step()
    scheduler.step()  # step the LR scheduler after the optimizer update
    optimizer.zero_grad()
    global_step += 1

    if global_step % Config.num_save_steps == 0:
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(
            Config.output_dir, 'pytorch_model_epoch{}.bin'.format(global_step))
        torch.save(model_to_save.state_dict(), output_model_file)
        # save config
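# NOTE: tr_loss, tr_mask_acc, tr_sop_acc, etc. in the snippet above are running
# average trackers whose implementation is not shown. Below is a minimal sketch
# of such an AverageMeter-style helper (assumed, not the original class).
class AverageMeter:
    """Keeps a running average of a scalar such as a loss or an accuracy."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)

# usage: tr_loss = AverageMeter(); tr_loss.update(loss.item(), n=1); tr_loss.avg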
def train(args):
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }

    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')
    vocab = Vocab.load('data/vocab.bpe10000/vocab')

    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up
    batch_size = 4096
    effective_batch_size = args.batch_size
    max_embeds = 1000 if args.decoder else 512

    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']

    model.train()
    model.to(device)
    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        # linear warm-up followed by linear decay to zero at max_iters
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return step / warm_up

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_func, last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])
    else:
        batch_count = 0
        epoch = 0
    cum_loss = 0.0

    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))

        loss = 0
        num_seq = 0
        optimizer.zero_grad()

        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target']['src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']

            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0

            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0

            if input_ids.shape[0] > max_embeds:
                print("Warning - length {} is greater than max length {}. Skipping."
                      .format(input_ids.shape[0], max_embeds))
                continue

            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)
            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)

            # per-sequence normalized MLM loss, accumulated until effective_batch_size is reached
            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]

            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()

                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20, scheduler.get_lr()[0]))
                    cum_loss = 0.0

                if batch_count % 10000 == 0:
                    fname1 = os.path.join(
                        args.save_dir,
                        'bert_{}_step_{}.pth'.format(
                            'decoder' if args.decoder else 'encoder', batch_count))
                    fname2 = os.path.join(
                        args.save_dir,
                        'bert_{}.pth'.format('decoder' if args.decoder else 'encoder'))
                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': model.module.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }
                    torch.save(state, fname1)
                    torch.save(state, fname2)
                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))

                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                loss = 0
                num_seq = 0

                if batch_count == max_iters:
                    print('[Learner] Reached max iters', file=sys.stderr)
                    exit()

        print("Max_len = {}".format(max_len))
        break
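# NOTE: mask_tokens() is called in train() above but not defined in this snippet.
# The version below is a sketch of the standard BERT-style masking recipe
# (80% mask token, 10% random token, 10% unchanged), matching the keyword
# arguments used above; the exact original implementation may differ.
import torch

def mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    """Prepare masked inputs and labels for masked language modeling."""
    inputs = inputs.clone()
    labels = inputs.clone()

    # sample which positions to mask
    probability_matrix = torch.full(labels.shape, mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked positions

    # 80% of the time, replace the masked position with the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id

    # 10% of the time, replace with a random token; the remaining 10% stay unchanged
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels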