def __init__(self, config, lr, num_warmup_steps, num_training_steps):
    super().__init__()
    self.model = transformers.GPT2LMHeadModel(config)
    self._config = config
    self._lr = lr
    self._num_warmup_steps = num_warmup_steps
    self._num_training_steps = num_training_steps
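# A hedged sketch of how the fields stored above would typically be consumed,
# assuming this __init__ belongs to a pytorch_lightning.LightningModule; the
# AdamW choice is illustrative, not taken from the source.
def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=self._lr)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self._num_warmup_steps,
        num_training_steps=self._num_training_steps)
    # step the schedule per optimizer step rather than per epoch
    return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]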
def __init__(self,
             vocab: nnlp.Vocab,
             n_embd: int = 256,
             n_layer: int = 2,
             n_head: int = 2,
             n_position: int = 128,
             n_ctx: int = 128,
             unk_hard_loss: float = -1.0):
    super(BiGPT2LM, self).__init__()
    config = transformers.GPT2Config(vocab_size=len(vocab),
                                     n_embd=n_embd,
                                     n_layer=n_layer,
                                     n_head=n_head,
                                     n_positions=n_position,
                                     n_ctx=n_ctx,
                                     output_hidden_states=True)
    # two language models over the same config: one left-to-right, one right-to-left
    self.gpt2model_fwd = transformers.GPT2LMHeadModel(config)
    self.gpt2model_rev = transformers.GPT2LMHeadModel(config)
    self.vocab = vocab
    self.unk_hard_loss = unk_hard_loss
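# Usage sketch for the two directional models above: the reverse model is fed the
# token sequence flipped along the time axis. How BiGPT2LM actually combines the
# two directions is not shown in the source; this pairing is an assumption.
def _directional_logits(self, input_ids: torch.Tensor):
    logits_fwd = self.gpt2model_fwd(input_ids=input_ids)[0]
    logits_rev = self.gpt2model_rev(input_ids=torch.flip(input_ids, dims=[1]))[0]
    # flip the reverse logits back so both align on the original positions
    return logits_fwd, torch.flip(logits_rev, dims=[1])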
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
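# Hypothetical call site for get_model; the cfg() helper is stubbed here with a
# plain dict lookup, and the key names simply mirror the lookups in the function.
# AutoTokenizer is assumed imported from transformers alongside AutoModelForCausalLM.
settings = {'random_init': False, 'model': 'gpt2', 'out_path': 'out/',
            'name': 'run1', 'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
cfg = settings.get  # stand-in for the project's cfg() accessor
tokenizer = AutoTokenizer.from_pretrained(cfg('model'))
model = get_model(tokenizer)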
def __init__(self,
             vocab: nnlp.Vocab,
             n_embd: int = 256,
             n_layer: int = 4,
             n_head: int = 4,
             n_position: int = 128,
             n_ctx: int = 128):
    super(GPT2Wrap, self).__init__()
    config = transformers.GPT2Config(vocab_size=len(vocab),
                                     n_embd=n_embd,
                                     n_layer=n_layer,
                                     n_head=n_head,
                                     n_positions=n_position,
                                     n_ctx=n_ctx,
                                     output_hidden_states=True)
    self.gpt2_model = transformers.GPT2LMHeadModel(config)
    self.vocab = vocab
    self.n_vocab = len(vocab)
def create_model(hparams, dictionary):
    # Config docs: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2config
    model = transformers.GPT2LMHeadModel(
        transformers.GPT2Config(vocab_size=len(dictionary),
                                n_embd=hparams["embedding_dim"],
                                n_layer=hparams["n_layer"],
                                n_head=hparams["n_head"],
                                n_positions=hparams['max_seq_length'],
                                n_ctx=hparams['max_seq_length']))
    if hparams["load_checkpoint"]:
        model.load_state_dict(
            torch.load(hparams["load_checkpoint"],
                       map_location=lambda storage, location: storage))
    if hparams["use_multi_gpu"]:
        assert torch.cuda.device_count() > 1
        print("Using %d GPUs" % torch.cuda.device_count())
        model = torch.nn.DataParallel(model)
    optim = torch.optim.Adam(model.parameters(), lr=hparams["lr"])
    return model, optim
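# Illustrative hparams for create_model; the keys are exactly those read above,
# while the values and the stand-in dictionary are placeholders.
dictionary = [f'tok{i}' for i in range(1000)]  # only len() is used by the config
hparams = {'embedding_dim': 256, 'n_layer': 4, 'n_head': 4,
           'max_seq_length': 128, 'load_checkpoint': None,
           'use_multi_gpu': False, 'lr': 1e-4}
model, optim = create_model(hparams, dictionary)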
import argparse
import os
import random
from datetime import datetime, timedelta

import numpy as np
import torch
import transformers
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# build_files, get_encoder and TimeCost are project-local helpers assumed to be
# importable from the surrounding repository


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='GPUs to use, comma-separated')
    parser.add_argument('--model_config', type=str, required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path', type=str, required=True,
                        help='path to the vocabulary file')
    parser.add_argument('--raw_data_path', type=str, required=True,
                        help='path to the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str,
                        required=False, help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true',
                        help='tokenize the raw corpus before training')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='number of epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='batch size')
    parser.add_argument('--lr', default=3e-5, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps', default=0.1, type=float, required=False,
                        help='proportion of total steps used for warmup')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='loss logging interval; must be a multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='window stride over the training corpus')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true',
                        help='use half-precision floating point')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=1, type=int, required=False,
                        help='minimum article length; shorter articles are discarded')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False,
                        help='path to a pretrained model to start from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str,
                        required=False, help='TensorBoard output directory')
    parser.add_argument('--segment', action='store_true',
                        help='tokenize at the word level')
    parser.add_argument('--bpe_token', action='store_true',
                        help='use byte pair encoding')
    parser.add_argument('--encoder_json', default='tokenizations/encoder.json',
                        type=str, help='encoder.json')
    parser.add_argument('--vocab_bpe', default='tokenizations/vocab.bpe',
                        type=str, help='vocab.bpe')
    parser.add_argument('--timezone', default=8, type=int,
                        help='timezone offset for logging, defaults to GMT+8')
    parser.add_argument('--epoch_save', default=1, type=int,
                        help='save weights every N epochs')
    args = parser.parse_args()
    print(f'Arguments: {args.__repr__()}')

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # select the GPUs to use
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print(f'Config:\n{model_config.to_json_string()}')
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path,
            do_lower_case=False,
            do_basic_tokenize=False)
    full_tokenizer.max_len = 999999

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using Device: {device.upper()}')

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    # do not enable fp16 on GPUs without half-precision support
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tz = args.timezone

    strlen = lambda n: len(str(n))
    get_time = lambda: datetime.utcnow() + timedelta(hours=tz)
    tb_writer = SummaryWriter(log_dir=args.writer_dir)

    assert log_step % gradient_accumulation == 0
    os.makedirs(output_dir, exist_ok=True)

    if raw:
        print('Building from Raw Data')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    tokenizer=full_tokenizer,
                    min_length=min_length)

    if not args.pretrained_model:
        model = transformers.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(args.pretrained_model)

    if torch.cuda.device_count() == 2:
        # split the 12 transformer blocks across the two GPUs
        device_map = {
            0: [0, 1, 2, 3, 4],
            1: [5, 6, 7, 8, 9, 10, 11],
        }
        model.parallelize(device_map)
        print('Model Parallelism!')
    model.train()
    if torch.cuda.device_count() < 2:
        model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print(f'Number of Parameters: {num_parameters}')

    multi_gpu = False
    full_len = 0
    print('Calculating Total Steps')
    for i in tqdm(range(num_pieces)):
        _fpath = os.path.join(tokenized_data_path, f'tokenized_train_{i}.txt')
        with open(_fpath, 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    warmup_steps = int(total_steps * warmup_steps)
    print(f'Total Steps: {total_steps}')

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=fp16_opt_level)

    # if torch.cuda.device_count() > 1:
    #     print(f'Using {torch.cuda.device_count()} GPUs')
    #     model = DataParallel(
    #         model, device_ids=[int(i) for i in args.device.split(',')])
    #     model.to(f'cuda:{model.device_ids[0]}')
    #     multi_gpu = True

    with TimeCost('Training'):
        print('Training Begin')
        overall_step = 0
        running_loss = 0
        for epoch in range(epochs):
            now = get_time()
            print(f'Epoch {epoch + 1} - Time: {now}')
            x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
            random.shuffle(x)
            piece_num = 0
            for i in x:
                _fpath = os.path.join(tokenized_data_path, f'tokenized_train_{i}.txt')
                with open(_fpath, 'r') as f:
                    line = f.read().strip()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
                start_point = 0
                samples = []
                while start_point < len(tokens) - n_ctx:
                    samples.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                if start_point < len(tokens):
                    idx = len(tokens) - n_ctx
                    samples.append(tokens[idx:])
                print(f'Tokenize {i} Sample Size: {len(samples)}')
                random.shuffle(samples)
                # drop the final step if it cannot fill a complete batch
                _steps = len(samples) // batch_size
                # if the number of samples is smaller than the batch size there would
                # be no steps to train at all; keeping num_pieces small avoids this
                _steps = 1 if _steps <= 0 else _steps
                for step in range(_steps):
                    # prepare data
                    batch = samples[step * batch_size:(step + 1) * batch_size]
                    batch_inputs = []
                    for ids in batch:
                        int_ids = [int(x) for x in ids]
                        batch_inputs.append(int_ids)
                    _device = 'cuda:0' if torch.cuda.device_count() > 1 else device
                    batch_inputs = torch.tensor(batch_inputs).long().to(_device)

                    # forward pass
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, _ = outputs[:2]

                    # get loss
                    if multi_gpu:
                        loss = loss.mean()
                    if gradient_accumulation > 1:
                        loss = loss / gradient_accumulation

                    # loss backward
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    # optimizer step
                    if (overall_step + 1) % gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar('loss',
                                             loss.item() * gradient_accumulation,
                                             overall_step)
                        ts = get_time().strftime('%H:%M:%S')
                        display_loss = running_loss * gradient_accumulation
                        display_loss /= log_step / gradient_accumulation
                        print(f'Time {ts} - '
                              f'Epoch {epoch + 1:{strlen(epochs)}d}/{epochs} - '
                              f'Step {step + 1:{strlen(_steps)}d}/{_steps} - '
                              f'Piece {piece_num + 1:{strlen(num_pieces)}d}/{num_pieces} - '
                              f'Loss {display_loss:.4f}')
                        running_loss = 0
                    overall_step += 1
                piece_num += 1

            if (epoch + 1) % args.epoch_save == 0:
                print(f'Saving Model of Epoch {epoch + 1}')
                model_output_dir = os.path.join(output_dir, f'model_epoch{epoch + 1}')
                os.makedirs(model_output_dir, exist_ok=True)
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(model_output_dir)

            then = get_time()
            print(f'Epoch {epoch + 1} Finished - Time: {then}')
            delta = (then - now).total_seconds()
            mm, ss = delta // 60, delta % 60
            hh, mm = mm // 60, mm % 60
            print(f'Time Cost of the Epoch {epoch + 1} - {hh:.0f}:{mm:.0f}:{ss:.2f}')

        print('Training Done')
        model_output_dir = os.path.join(output_dir, 'final_model')
        os.makedirs(model_output_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_output_dir)
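# Sampling from the weights saved above. A minimal sketch, assuming the script's
# final_model output and its BERT-style tokenizer; the vocab path, output path,
# and prompt text are placeholders, not values from the source.
tokenizer = tokenization_bert.BertTokenizer(vocab_file='cache/vocab.txt')
model = transformers.GPT2LMHeadModel.from_pretrained('model/final_model')
model.eval()
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('今天天氣很好'))
output = model.generate(torch.tensor([ids]), max_length=64,
                        do_sample=True, top_k=40)
print(''.join(tokenizer.convert_ids_to_tokens(output[0].tolist())))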
def __init__(
        self,
        tokenizer_model,
        train_file,
        valid_file,
        test_file,
        from_pretrained=None,
        block_size=1024,
        # [Model config]
        # small  -> n_layer=12, n_head=12, n_embd=768
        # medium -> n_layer=24, n_head=16, n_embd=1024
        # large  -> n_layer=36, n_head=20, n_embd=1280
        # XL     -> n_layer=48, n_head=25, n_embd=1600
        n_layer=12,
        n_head=12,
        n_embd=768,
        # [DataLoader options]
        batch_size=2,
        prefetch_factor=10,
        num_workers=1,
        shuffle_buffer_size=1000,
        lr=1e-4,
        num_warmup_steps=0,
        num_training_steps=None,
):
    super().__init__()

    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)
    self._tokenizer = tokenizer

    # Load or initialize model
    if from_pretrained:
        config = transformers.GPT2Config.from_pretrained(from_pretrained)
        model = transformers.GPT2LMHeadModel.from_pretrained(from_pretrained)
    else:
        # Prepare model
        config = transformers.GPT2Config(
            vocab_size=len(tokenizer),
            tokenizer_class=tokenizer.__class__.__name__,
            bos_token_id=tokenizer.bos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            sep_token_id=tokenizer.sep_token_id,
            cls_token_id=tokenizer.cls_token_id,
            unk_token_id=tokenizer.unk_token_id,
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd)
        model = transformers.GPT2LMHeadModel(config)
    self.model = model
    self._config = config
    self._train_file = train_file
    self._valid_file = valid_file
    self._test_file = test_file
    self._batch_size = batch_size
    self._prefetch_factor = prefetch_factor
    self._num_workers = num_workers
    self._shuffle_buffer_size = shuffle_buffer_size
    self._lr = lr
    self._num_warmup_steps = num_warmup_steps
    self._num_training_steps = num_training_steps
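# Hypothetical instantiation of the module whose __init__ appears above; the class
# name, tokenizer id, and file paths are placeholders, not values from the source.
module = GPT2LMModule(
    tokenizer_model='gpt2',
    train_file='data/train.txt',
    valid_file='data/valid.txt',
    test_file='data/test.txt',
    n_layer=12, n_head=12, n_embd=768,  # the "small" preset from the comments
    num_training_steps=100000)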
    split_lens[1] = len(dataset) - split_lens[0]
    train_set, valid_set = torch.utils.data.random_split(dataset, split_lens)

    print("Loading Model...")
    config = transformers.GPT2Config(vocab_size=261,
                                     n_positions=seq_len,
                                     n_ctx=seq_len,
                                     n_embd=30,
                                     n_layer=3,
                                     n_head=3)
    model = transformers.GPT2LMHeadModel(config=config)

    print("Training Model...")
    writer = SummaryWriter()
    training_args = transformers.TrainingArguments(
        output_dir="models/gpt2/",
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=1,
        logging_first_step=True,
        save_steps=2000,
    )
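    # The snippet stops inside TrainingArguments; a hedged continuation showing how
    # such a configuration typically feeds transformers.Trainer (the
    # evaluate_during_training flag dates this to transformers 3.x). The collator is
    # an assumption: it stacks fixed-length id sequences and reuses them as labels
    # for the causal LM loss.
    def collate(batch):
        ids = torch.stack([torch.as_tensor(x, dtype=torch.long) for x in batch])
        return {"input_ids": ids, "labels": ids}

    trainer = transformers.Trainer(model=model,
                                   args=training_args,
                                   data_collator=collate,
                                   train_dataset=train_set,
                                   eval_dataset=valid_set)
    trainer.train()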