import argparse
import datetime
import logging
import os
from os.path import join

import numpy as np
import torch
from tqdm import tqdm
# Model/tokenizer classes as in the HuggingFace transformers library.
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

# Project-local helpers (BucketingDataLoader, DistributedBucketingDataLoader,
# DynamicBatchingLoader, get_eval_list_same_length, eval_model_loss, train,
# get_kogpt2_model, VOCAB_PATH, get_rank) are assumed to be importable from the
# repository's own modules; their exact import paths are not shown here.

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--train_input_file", type=str, default='data/train.128len.db')
    parser.add_argument("--eval_input_file", type=str, default='./data/dummy_data.tsv')
    parser.add_argument("--output_dir", type=str, default='output')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--skip_eval", action='store_true',
                        help='If true, skip evaluation.')
    parser.add_argument("--continue_from", type=int, default=0)
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="batch size now means per GPU per step")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
                        help="to increase effective batch size and reduce synchronization")
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=1e-5)
    parser.add_argument("--adam_epsilon", type=float, default=1e-8,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=float, default=1.0,
                        help="Max gradient norm.")
    parser.add_argument("--num_optim_steps", type=int, default=1000000,
                        help="new API specifies num update steps")
    parser.add_argument("--valid_step", type=int, default=10000,
                        help="how many optim steps between validations")
    parser.add_argument("--warmup_proportion", type=float, default=0.1)
    parser.add_argument("--warmup_steps", type=int, default=16000)
    parser.add_argument("--normalize_data", type=boolean_string, default=True)
    parser.add_argument("--fp16", type=boolean_string, default=True)
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in "
                             "['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--lr_schedule", type=str,
                        choices=['noam', 'noamwd', 'BERT', 'None'], default='noam')
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument("--no_token_id", type=boolean_string, default=True)
    parser.add_argument("--log_dir", type=str)
    parser.add_argument('--pbar', type=boolean_string, default=True,
                        help='turn on progress bar')
    # distributed
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='for torch.distributed')
    args = parser.parse_args()

    # Split the requested batch size into per-step micro-batches for gradient
    # accumulation; the effective batch size per GPU stays the same.
    assert args.train_batch_size % args.gradient_accumulation_steps == 0, \
        'batch size % gradient accumulation steps != 0!'
    args.train_batch_size = (args.train_batch_size
                             // args.gradient_accumulation_steps)
    logger.info(
        f'train batch size = {args.train_batch_size * args.gradient_accumulation_steps}, '
        f'new train batch size (after gradient accumulation) = {args.train_batch_size}')

    if args.local_rank == -1:
        logger.info(f'CUDA available? {str(torch.cuda.is_available())}')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
        args.device, args.n_gpu = device, n_gpu
    else:
        # distributed training
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.distributed.get_world_size()
        args.device, args.n_gpu = device, 1
    logger.info(
        f"device: {device} n_gpu: {n_gpu}, "
        f"distributed training: {bool(args.local_rank != -1)}, "
        f"16-bits training: {args.fp16}")

    # Seed everything for reproducibility.
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Per-run output directory and CSV-style train/eval log files.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
    output_dir = join(args.output_dir,
                      'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
                                                   args.train_batch_size,
                                                   n_gpu, timestamp))
    log_dir = args.log_dir if args.log_dir is not None and len(args.log_dir) > 0 \
        else output_dir
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        os.makedirs(output_dir, exist_ok=True)

    train_logger = open(join(log_dir, 'train_log.txt'), 'a+', buffering=1)
    eval_logger = open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1)
    print('epoch,global_step,step,mean_loss,n_token_real,n_token_total,epoch_time',
          file=train_logger)
    print('epoch,global_step,step,eval_loss', file=eval_logger)

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    config = GPT2Config.from_pretrained(args.model_name_or_path)

    if args.local_rank == -1:
        train_dataloader = BucketingDataLoader(args.train_input_file,
                                               args.train_batch_size,
                                               args.max_seq_length)
    else:
        train_dataloader = DistributedBucketingDataLoader(
            torch.distributed.get_rank(), torch.distributed.get_world_size(),
            args.train_input_file, args.train_batch_size,
            args.max_seq_length)

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    model = model.to(args.device)

    global_step, tr_loss = train(args, train_dataloader, model, tokenizer,
                                 train_logger, eval_logger)
    logger.info('Input Argument Information')
    args_dict = vars(args)
    for a in args_dict:
        logger.info('%-28s %s' % (a, args_dict[a]))

    #########################################################################
    # Prepare Data Set
    #########################################################################
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    config = GPT2Config.from_json_file(join(args.model_name_or_path, 'config.json'))

    if args.local_rank == -1:
        train_dataloader = BucketingDataLoader(args.train_input_file,
                                               args.train_batch_size,
                                               args.max_seq_length)
    else:
        pass
        # train_dataloader = DistributedBucketingDataLoader(
        #     get_rank(), get_world_size(),
        #     args.train_input_file, args.train_batch_size,
        #     args.max_seq_length)

    eval_dataloader_loss = DynamicBatchingLoader(args.eval_input_file, enc,
                                                 args.normalize_data,
                                                 args.eval_batch_size,
                                                 args.max_seq_length)
    eval_dataloader_gen = get_eval_list_same_length(args.eval_input_file, enc,
                                                    args.eval_batch_size, True)
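    # eval_dataloader_loss is presumably consumed by eval_model_loss() for
    # validation loss/perplexity, while eval_dataloader_gen batches same-length
    # contexts for generation-style evaluation.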
    print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger)

    # Evaluate every saved .pkl checkpoint, ordered by the global step encoded
    # in its filename.
    filenames = os.listdir(args.init_checkpoint)
    filenames = [f for f in filenames if f.endswith('.pkl')]
    filenames = sorted(filenames, key=lambda x: int(x[18:x.index('.')]))

    for filename in tqdm(filenames):
        global_step = int(filename[18:filename.index('.')])
        model_path = os.path.join(args.init_checkpoint, filename)
        model, vocab = get_kogpt2_model(model_path, VOCAB_PATH, 0)
        if args.fp16:
            logger.info('in fp16, model.half() activated')
            model.half()
        if args.n_gpu > 1:
            logging.info('data parallel because more than one gpu')
            model = torch.nn.DataParallel(model)
        eval_dataloader_loss = BucketingDataLoader(args.eval_input_file,
                                                   args.eval_batch_size,
                                                   args.max_seq_length)
        if args.local_rank == -1 or get_rank() == 0:
            eval_loss, eval_ppl = eval_model_loss(model, enc,
                                                  eval_dataloader_loss, args)
            print(f'{global_step + 1},{eval_loss},{eval_ppl}', file=eval_logger)

    if args.local_rank == -1 or get_rank() == 0:
        train_logger.close()
        eval_logger.close()
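
# The argparse flags above rely on a boolean_string converter that is not shown
# in this listing. A minimal sketch of such a helper (assumed, in the style of
# the DialoGPT training utilities) could look like this:
def boolean_string(s):
    if s not in {'False', 'True'}:
        raise ValueError('Not a valid boolean string')
    return s == 'True'


# Assumed entry point when the script is run directly.
if __name__ == '__main__':
    main()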