def main():
    """Build the tokenizer and model, then generate samples.

    Reads the run configuration via ``get_args()``, initializes the
    distributed backend, seeds the RNGs with ``args.seed``, and hands the
    model and tokenizer to ``generate_samples`` on the current CUDA device.
    """
    print('Generate Samples')

    # Run with CuDNN switched off.
    torch.backends.cudnn.enabled = False

    # Timer container (constructed here; not otherwise used in this function).
    stopwatch = Timers()

    # Parsed run arguments.
    run_args = get_args()

    # Bring up torch.distributed.
    initialize_distributed(run_args)

    # Seed the RNGs from the configured seed.
    set_random_seed(run_args.seed)

    # Tokenizer first, then the model.
    tok = prepare_tokenizer(run_args)
    net = setup_model(run_args)

    # Generation always uses a batch size of 1.
    run_args.batch_size = 1

    generate_samples(net, tok, run_args, torch.cuda.current_device())
def prepare_model():
    """Set up everything needed for generation and return it.

    Returns:
        A ``(model, tokenizer, args)`` tuple. ``args`` comes from
        ``set_args()`` and is modified here: ``mem_length`` is extended by
        ``seq_length - 1``, ``seed`` is replaced by a fresh random value, and
        ``batch_size`` is forced to 1.
    """
    # Run with CuDNN switched off.
    torch.backends.cudnn.enabled = False

    # Timer container (constructed here; not otherwise used in this function).
    stopwatch = Timers()

    # Run arguments.
    args = set_args()
    args.mem_length = args.seq_length + args.mem_length - 1

    # Bring up torch.distributed.
    initialize_distributed(args)

    # A new seed is drawn on every call, overriding whatever args carried.
    args.seed = random.randint(0, 1000000)
    set_random_seed(args.seed)

    # Tokenizer first, then the model.
    tok = prepare_tokenizer(args)
    net = setup_model(args)

    # Generation always uses a batch size of 1.
    args.batch_size = 1

    return net, tok, args
def main():
    """Build a GPT2Tokenizer and the model, then generate samples.

    The tokenizer is constructed from three files under
    ``args.tokenizer_path``; the model is the first element returned by
    ``setup_model_and_optimizer``.
    """
    print('Generate Samples')

    # Run with CuDNN switched off.
    torch.backends.cudnn.enabled = False

    # Timer container (constructed here; not otherwise used in this function).
    stopwatch = Timers()

    # Parsed run arguments.
    args = get_args()

    # Bring up torch.distributed.
    initialize_distributed(args)

    # Seed the RNGs from the configured seed.
    set_random_seed(args.seed)

    # Tokenizer files, passed positionally in this order.
    tokenizer_files = [
        os.path.join(args.tokenizer_path, file_name)
        for file_name in ('vocab.json', 'merges.txt', 'chinese_vocab.model')
    ]
    tokenizer = GPT2Tokenizer(*tokenizer_files)

    # Only the model is needed out of the setup result.
    setup_result = setup_model_and_optimizer(args)
    model = setup_result[0]

    # Generation always uses a batch size of 1.
    args.batch_size = 1

    generate_samples(model, tokenizer, args, torch.cuda.current_device())
def main():
    """Build the tokenizer and model, then dispatch to a generation mode.

    Mode selection:
      * ``args.num_samples != 0``: unconditional generation written out via
        ``write_and_generate_samples_unconditional``.
      * ``args.num_samples == 0`` and ``args.sample_input_file != ""``:
        prompts are taken from the input file.
      * otherwise: interactive generation.

    In the two ``num_samples == 0`` modes the batch size is forced to 1.
    """
    print('Generate Samples')

    # CuDNN is left enabled in this variant.
    torch.backends.cudnn.enabled = True

    # Timer container (constructed here; not otherwise used in this function).
    stopwatch = Timers()

    # Parsed run arguments.
    args = get_args()

    # Bring up torch.distributed.
    initialize_distributed(args)

    # Seed the RNGs from the configured seed.
    set_random_seed(args.seed)

    # Tokenizer first, then the model.
    tokenizer = prepare_tokenizer(args)
    model = setup_model(args)

    # The generation helpers read the device from args.
    args.device = torch.cuda.current_device()

    if args.num_samples != 0:
        write_and_generate_samples_unconditional(model, tokenizer, args)
        return

    # Prompted generation runs with a batch size of 1.
    args.batch_size = 1
    if args.sample_input_file != "":
        generate_samples_input_from_file(model, tokenizer, args)
    else:
        generate_samples_interactive(model, tokenizer, args)
def main():
    """Fine-tune and/or evaluate the model according to the run arguments.

    With ``args.do_train`` the model is trained for ``args.epoch`` epochs on
    the 'train' split. Every ``args.log_interval`` global steps the mean
    training loss is logged; every ``args.eval_interval`` global steps the
    model is scored on the 'dev' split and a checkpoint is saved under
    ``<results_dir>/dev_step-<global_step>``.

    With ``args.do_eval`` the model is scored on the 'test' split, using the
    best dev checkpoint when training ran in the same invocation, or
    ``args.eval_ckpt_path`` otherwise.

    All log files are written under
    ``<args.results_dir>/<args.model_name>-<timestamp>`` by rank 0 only.
    """
    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer container (constructed here; not otherwise used in this function).
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Get the tokenizer.
    tokenizer = GPT2Tokenizer(
        os.path.join(args.tokenizer_path, 'vocab.json'),
        os.path.join(args.tokenizer_path, 'chinese_vocab.model'))

    # Load the training data and derive the iteration budget.
    if args.do_train:
        train_dataloader, _ = load_data(args, 'train', tokenizer, 1)
        dev_dataloader, _ = load_data(args, 'dev', tokenizer, 1)

        with open(args.deepspeed_config, "r") as f:
            deepspeed_conf = json.load(f)

        epoch = args.epoch
        grad_acc = deepspeed_conf["gradient_accumulation_steps"]
        # NOTE(review): true division makes train_iters a float; confirm the
        # scheduler accepts a non-integer iteration count.
        args.train_iters = len(train_dataloader) * epoch / grad_acc
        # TODO: maybe need to reinitialize optimizer
    elif args.do_eval:
        # Set an arbitrary positive integer since the optimizer and the
        # scheduler will not be used when doing eval only.
        args.train_iters = 1

    # Model, optimizer, and learning rate.
    model, optimizer, lr_scheduler = setup_model_and_optimizer_C(args)

    device = torch.cuda.current_device()

    # Give a time stamp to the model.
    # NOTE(review): every rank formats its own timestamp, so ranks that cross
    # a second boundary would use different directories; verify if that
    # matters for this deployment.
    cur_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    results_dir = os.path.join(
        args.results_dir, "{}-{}".format(args.model_name, cur_time))
    os.makedirs(results_dir, exist_ok=True)

    if args.do_train and torch.distributed.get_rank() == 0:
        with open(os.path.join(results_dir, "train_log.txt"), "w") as f:
            f.write("Train losses:\n")
        with open(os.path.join(results_dir, "dev_log.txt"), "w") as f:
            f.write("Dev accs:\n")

    torch.distributed.barrier()

    if args.do_train:
        total_loss, logging_loss, best_acc = 0.0, 0.0, 0.0
        global_step, total_step, best_step = 0, 0, 0
        # Stays None until the first dev evaluation creates a directory.
        dev_results_dir = None

        for e in range(epoch):
            model.train()
            for batch, no_model_batch in tqdm(
                    train_dataloader,
                    disable=(torch.distributed.get_rank() != 0)):
                for k in batch:
                    batch[k] = batch[k].to(device)
                for k in no_model_batch:
                    no_model_batch[k] = no_model_batch[k].to(device)

                output = model(**batch)
                # Average the model output over the positions selected by
                # the loss mask.
                loss_mask = no_model_batch["loss_mask"]
                output = (torch.sum(output * loss_mask.unsqueeze(-1), 1)
                          / torch.sum(loss_mask, -1).unsqueeze(-1))
                labels = no_model_batch["truth"].float()

                # NOTE(review): CrossEntropyLoss is called here as a function
                # of (output, labels); confirm it is not the torch.nn module
                # class, which would need to be instantiated first.
                losses = CrossEntropyLoss(
                    output.unsqueeze(1).contiguous().float(),
                    labels.unsqueeze(1))
                loss = torch.mean(losses)

                model.backward(loss)
                model.step()

                # Average the loss over the data-parallel group for logging.
                torch.distributed.all_reduce(
                    loss.data, group=mpu.get_data_parallel_group())
                loss.data = loss.data / mpu.get_data_parallel_world_size()
                total_loss += loss.item() / grad_acc

                # A global step is complete only after grad_acc micro-batches
                # have been accumulated, so count the micro-batch first and
                # then test for the boundary.
                total_step += 1
                if total_step % grad_acc != 0:
                    continue
                global_step += 1

                if global_step % args.log_interval == 0:
                    # Logging.
                    if torch.distributed.get_rank() == 0:
                        train_log = "Epoch {}, global step {}, total step {}, train lm loss: {}".format(
                            e, global_step, epoch * len(train_dataloader),
                            (total_loss - logging_loss) / args.log_interval)
                        yprint(train_log)
                        with open(os.path.join(results_dir, "train_log.txt"), "a") as f:
                            f.write(train_log + "\n")

                    logging_loss = total_loss

                if global_step % args.eval_interval == 0:
                    # Evaluate on the dev set.
                    acc, _, _ = evaluate_tnews(
                        args, model, dev_dataloader, device, mode="dev")
                    dev_results_dir = os.path.join(
                        results_dir, "dev_step-{}".format(global_step))

                    if acc > best_acc:
                        best_acc = acc
                        best_step = global_step

                    if torch.distributed.get_rank() == 0:
                        # We will only write the log file once.
                        dev_log = "Epoch: {}, Global step: {}, Acc: {}".format(
                            e, global_step, acc)
                        yprint(dev_log)
                        os.makedirs(dev_results_dir, exist_ok=True)
                        with open(os.path.join(dev_results_dir, "dev_result.txt"), "w") as f:
                            f.write(dev_log + "\n")
                        with open(os.path.join(results_dir, "dev_log.txt"), "a") as f:
                            f.write(dev_log + "\n")

                    torch.distributed.barrier()

                    args.save = dev_results_dir
                    save_checkpoint(
                        global_step, model, optimizer, lr_scheduler, args)

        # Write the summary once, from rank 0. If no dev evaluation ran there
        # is no dev directory, so fall back to the run's results directory.
        if torch.distributed.get_rank() == 0:
            summary_dir = results_dir if dev_results_dir is None else dev_results_dir
            with open(os.path.join(summary_dir, "dev_log.txt"), "a") as f:
                f.write("Best acc: {} Best step: {}\n".format(best_acc, best_step))

    if args.do_eval:
        # Evaluate on the test set.
        test_dataloader, test_dataset = load_data(args, 'test', tokenizer, 1)
        cand_ids = torch.tensor(test_dataset.cand_ids).to(device)

        if args.do_train:
            # If training ran, evaluate the checkpoint with the max dev acc.
            eval_ckpt_path = os.path.join(
                results_dir, "dev_step-{}".format(best_step))
            args.load = eval_ckpt_path
        else:
            # If only doing eval, evaluate the checkpoint given by the user.
            args.load = args.eval_ckpt_path

        load_checkpoint(model=model, optimizer=None, lr_scheduler=None, args=args)
        acc, _, _ = evaluate(
            args, model, test_dataloader, cand_ids, device, mode="test")

        if torch.distributed.get_rank() == 0:
            eval_log = "Checkpoint from {}: Acc: {}".format(args.load, acc)
            yprint(eval_log)
            with open(os.path.join(results_dir, "eval_log"), "w") as f:
                f.write(eval_log + "\n")

        torch.distributed.barrier()