Example #1
    def __init__(self, config, lr, num_warmup_steps, num_training_steps):
        super().__init__()
        # Randomly initialized GPT-2 LM built from the given config
        self.model = transformers.GPT2LMHeadModel(config)
        # Keep the config and training hyperparameters for later optimizer/scheduler setup
        self._config = config
        self._lr = lr
        self._num_warmup_steps = num_warmup_steps
        self._num_training_steps = num_training_steps
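
The stored learning rate and scheduler step counts are clearly meant for a later optimizer setup. A minimal sketch of such a method, assuming the wrapper is a PyTorch Lightning-style module with torch and transformers imported as in the other examples (the hook name and return format below are Lightning conventions, not shown in the snippet):

    def configure_optimizers(self):
        # AdamW over the wrapped GPT-2 parameters, using the stored hyperparameters
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self._lr)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self._num_warmup_steps,
            num_training_steps=self._num_training_steps)
        # Step the schedule every optimizer step rather than every epoch
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]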
Example #2
    def __init__(self,
                 vocab: nnlp.Vocab,
                 n_embd: int = 256,
                 n_layer: int = 2,
                 n_head: int = 2,
                 n_position: int = 128,
                 n_ctx: int = 128,
                 unk_hard_loss: float = -1.0):
        super(BiGPT2LM, self).__init__()

        config = transformers.GPT2Config(vocab_size=len(vocab),
                                         n_embd=n_embd,
                                         n_layer=n_layer,
                                         n_head=n_head,
                                         n_positions=n_position,
                                         n_ctx=n_ctx,
                                         output_hidden_states=True)

        # One GPT-2 language-model head per direction (forward and reverse)
        self.gpt2model_fwd = transformers.GPT2LMHeadModel(config)
        self.gpt2model_rev = transformers.GPT2LMHeadModel(config)

        self.vocab = vocab
        self.unk_hard_loss = unk_hard_loss
Example #3
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(
            cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
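
get_model relies on a cfg helper for settings such as random_init, model, out_path, name, and device; that helper is not part of the snippet, so the call pattern below is only an illustrative assumption with placeholder values:

# Hypothetical stand-in for the cfg helper used above
_SETTINGS = {'random_init': False, 'model': 'gpt2', 'out_path': 'runs/',
             'name': 'my-run', 'device': 'cpu'}

def cfg(key):
    return _SETTINGS[key]

tokenizer = transformers.AutoTokenizer.from_pretrained(cfg('model'))
model = get_model(tokenizer)                 # fresh pretrained weights
# model = get_model(tokenizer, resume=True)  # or resume from out_path + name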
Example #4
    def __init__(self,
                 vocab: nnlp.Vocab,
                 n_embd: int = 256,
                 n_layer: int = 4,
                 n_head: int = 4,
                 n_position: int = 128,
                 n_ctx: int = 128):
        super(GPT2Wrap, self).__init__()

        config = transformers.GPT2Config(vocab_size=len(vocab),
                                         n_embd=n_embd,
                                         n_layer=n_layer,
                                         n_head=n_head,
                                         n_positions=n_position,
                                         n_ctx=n_ctx,
                                         output_hidden_states=True)

        self.gpt2_model = transformers.GPT2LMHeadModel(config)
        self.vocab = vocab
        self.n_vocab = len(vocab)
Example #5
def create_model(hparams, dictionary):
    # Config docs: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2config
    model = transformers.GPT2LMHeadModel(
        transformers.GPT2Config(vocab_size=len(dictionary),
                                n_embd=hparams["embedding_dim"],
                                n_layer=hparams["n_layer"],
                                n_head=hparams["n_head"],
                                n_positions=hparams['max_seq_length'],
                                n_ctx=hparams['max_seq_length']))

    if hparams["load_checkpoint"]:
        model.load_state_dict(
            torch.load(hparams["load_checkpoint"],
                       map_location=lambda storage, location: storage))

    if hparams["use_multi_gpu"]:
        assert torch.cuda.device_count() > 1
        print("Using %d GPUs" % torch.cuda.device_count())
        model = torch.nn.DataParallel(model)

    optim = torch.optim.Adam(model.parameters(), lr=hparams["lr"])

    return model, optim
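
create_model only needs a sized vocabulary object and a handful of hyperparameters; an illustrative call with placeholder values (these names and numbers are not part of the original example):

# Illustrative hyperparameters; real values would come from the caller's config
hparams = {
    "embedding_dim": 256,
    "n_layer": 4,
    "n_head": 4,
    "max_seq_length": 128,
    "load_checkpoint": None,   # falsy, so no state dict is loaded
    "use_multi_gpu": False,
    "lr": 1e-4,
}
dictionary = ["<pad>", "<unk>"] + [chr(c) for c in range(ord("a"), ord("z") + 1)]
model, optim = create_model(hparams, dictionary)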
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='GPUs to use, comma separated')
    parser.add_argument('--model_config',
                        type=str,
                        required=False,
                        help='Path to the model config file')
    parser.add_argument('--tokenizer_path',
                        type=str,
                        required=True,
                        help='Path to the vocabulary file')
    parser.add_argument('--raw_data_path',
                        type=str,
                        required=True,
                        help='Path to the raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='Output path for the tokenized corpus')
    parser.add_argument('--raw',
                        action='store_true',
                        help='Tokenize the raw corpus first (set when it has not been tokenized yet)')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='Batch size')
    parser.add_argument('--lr',
                        default=3e-5,
                        type=float,
                        required=False,
                        help='Learning rate')
    parser.add_argument('--warmup_steps',
                        default=0.1,
                        type=float,
                        required=False,
                        help='Warmup steps as a fraction of total steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='Loss logging interval; must be an integer multiple of gradient_accumulation')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='Stride of the sliding window over the training corpus')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='Gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='Use half-precision (FP16) training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='Number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=1,
                        type=int,
                        required=False,
                        help='Minimum article length; shorter articles are discarded')
    parser.add_argument('--output_dir', type=str, required=True, help='Model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='Path of the pretrained model to start from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='TensorBoard output path')
    parser.add_argument('--segment', action='store_true', help='Tokenize at the word level')
    parser.add_argument('--bpe_token',
                        action='store_true',
                        help='Use Byte Pair Encoding')
    parser.add_argument('--encoder_json',
                        default='tokenizations/encoder.json',
                        type=str,
                        help='encoder.json')
    parser.add_argument('--vocab_bpe',
                        default='tokenizations/vocab.bpe',
                        type=str,
                        help='vocab.bpe')
    parser.add_argument('--timezone',
                        default=8,
                        type=int,
                        help='Time zone offset in hours, default GMT+8')
    parser.add_argument('--epoch_save',
                        default=1,
                        type=int,
                        help='Save the weights every N epochs')

    args = parser.parse_args()
    print(f'Arguments: {args!r}')

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # Select which GPUs are visible
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print(f'Config:\n{model_config.to_json_string()}')

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path,
            do_lower_case=False,
            do_basic_tokenize=False)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using Device: {device.upper()}')

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    # Do not enable this on GPUs without half-precision support
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tz = args.timezone
    strlen = lambda n: len(str(n))
    get_time = lambda: datetime.utcnow() + timedelta(hours=tz)
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    os.makedirs(output_dir, exist_ok=True)

    if raw:
        print('Building from Raw Data')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    tokenizer=full_tokenizer,
                    min_length=min_length)

    if not args.pretrained_model:
        model = transformers.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)

    if torch.cuda.device_count() == 2:
        device_map = {
            0: [0, 1, 2, 3, 4],
            1: [5, 6, 7, 8, 9, 10, 11],
        }
        model.parallelize(device_map)
        # model.parallelize()
        print('Model Parallelism!')

    model.train()
    if torch.cuda.device_count() < 2:
        model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print(f'Number of Parameters: {num_parameters}')

    multi_gpu = False
    full_len = 0
    print('Calculating Total Steps')
    for i in tqdm(range(num_pieces)):
        _fpath = os.path.join(tokenized_data_path, f'tokenized_train_{i}.txt')
        with open(_fpath, 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    warmup_steps = int(total_steps * warmup_steps)
    print(f'Total Steps: {total_steps}')

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    # if torch.cuda.device_count() > 1:
    #     print(f'Using {torch.cuda.device_count()} GPUs')
    #     model = DataParallel(
    #         model, device_ids=[int(i) for i in args.device.split(',')])
    #     model.to(f'cuda:{model.device_ids[0]}')
    #     multi_gpu = True

    with TimeCost('Training'):
        print('Training Begin')
        overall_step = 0
        running_loss = 0

        for epoch in range(epochs):
            now = get_time()
            print(f'Epoch {epoch + 1} - Time: {now}')
            x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
            random.shuffle(x)
            piece_num = 0
            for i in x:
                _fpath = os.path.join(tokenized_data_path,
                                      f'tokenized_train_{i}.txt')
                with open(_fpath, 'r') as f:
                    line = f.read().strip()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
                start_point = 0
                samples = []
                while start_point < len(tokens) - n_ctx:
                    samples.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                if start_point < len(tokens):
                    idx = len(tokens) - n_ctx
                    samples.append(tokens[idx:])
                print(f'Tokenize {i} Sample Size: {len(samples)}')
                random.shuffle(samples)
                # Drop the last step if it does not fill a complete batch
                _steps = len(samples) // batch_size
                # If the number of samples is smaller than the batch size there would be
                # no steps to train on; keeping num_pieces reasonably small also avoids this
                _steps = 1 if _steps <= 0 else _steps

                for step in range(_steps):
                    # prepare data
                    batch = samples[step * batch_size:(step + 1) * batch_size]
                    batch_inputs = []
                    for ids in batch:
                        int_ids = [int(x) for x in ids]
                        batch_inputs.append(int_ids)
                    _device = ('cuda:0'
                               if torch.cuda.device_count() > 1 else device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(
                        _device)

                    # forward pass
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, _ = outputs[:2]

                    # get loss
                    if multi_gpu:
                        loss = loss.mean()
                    if gradient_accumulation > 1:
                        loss = loss / gradient_accumulation

                    # loss backward
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    # optimizer step
                    if (overall_step + 1) % gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar(
                            'loss',
                            loss.item() * gradient_accumulation, overall_step)
                        ts = get_time().strftime('%H:%M:%S')
                        display_loss = running_loss * gradient_accumulation
                        display_loss /= log_step / gradient_accumulation
                        print(
                            f'Time {ts} - '
                            f'Epoch {epoch + 1:{strlen(epochs)}d}/{epochs} - '
                            f'Step {step + 1:{strlen(_steps)}d}/{_steps} - '
                            f'Piece {piece_num + 1:{strlen(num_pieces)}d}/{num_pieces} - '
                            f'Loss {display_loss:.4f}')
                        running_loss = 0
                    overall_step += 1
                piece_num += 1

            if (epoch + 1) % args.epoch_save == 0:
                print(f'Saving Model of Epoch {epoch + 1}')
                model_output_dir = os.path.join(output_dir,
                                                f'model_epoch{epoch + 1}')
                os.makedirs(model_output_dir, exist_ok=True)
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.save_pretrained(model_output_dir)

            then = get_time()
            print(f'Epoch {epoch + 1} Finished - Time: {then}')
            delta = (then - now).total_seconds()
            mm, ss = delta // 60, delta % 60
            hh, mm = mm // 60, mm % 60
            print(
                f'Time Cost of the Epoch {epoch + 1} - {hh:.0f}:{mm:.0f}:{ss:.2f}'
            )

        print('Training Done')
    model_output_dir = os.path.join(output_dir, 'final_model')
    os.makedirs(model_output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(model_output_dir)
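
After training, the weights written with save_pretrained can be loaded back for inference. A minimal sketch, assuming the final_model directory produced by the script above; the output path and prompt ids are purely illustrative, and real prompts would come from the same tokenizer used for training:

import os
import torch
import transformers

output_dir = 'model/'  # hypothetical; must match the --output_dir used for training
model = transformers.GPT2LMHeadModel.from_pretrained(
    os.path.join(output_dir, 'final_model'))
model.eval()

input_ids = torch.tensor([[101]])  # placeholder prompt ids
with torch.no_grad():
    generated = model.generate(input_ids, max_length=32, do_sample=True, top_k=40)
print(generated)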
Example #7
    def __init__(
        self,
        tokenizer_model,
        train_file,
        valid_file,
        test_file,
        from_pretrained=None,
        block_size=1024,
        # [Model config]
        # for small
        n_layer=12,
        n_head=12,
        n_embd=768,
        # for medium -> n_layer=24, n_head=16, n_embd=1024
        # for large  -> n_layer=36, n_head=20, n_embd=1280
        # for XL     -> n_layer=48, n_head=25, n_embd=1600
        # [DataLoader options]
        batch_size=2,
        prefetch_factor=10,
        num_workers=1,
        shuffle_buffer_size=1000,
        lr=1e-4,
        num_warmup_steps=0,
        num_training_steps=None,
    ):
        super().__init__()

        # Load tokenizer
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)
        self._tokenizer = tokenizer

        # Load or initialize model
        if from_pretrained:
            config = transformers.GPT2Config.from_pretrained(from_pretrained)
            model = transformers.GPT2LMHeadModel.from_pretrained(
                from_pretrained)
        else:
            # Prepare model
            config = transformers.GPT2Config(
                vocab_size=len(tokenizer),
                tokenizer_class=tokenizer.__class__.__name__,
                bos_token_id=tokenizer.bos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                sep_token_id=tokenizer.sep_token_id,
                cls_token_id=tokenizer.cls_token_id,
                unk_token_id=tokenizer.unk_token_id,
                #
                n_layer=n_layer,
                n_head=n_head,
                n_embd=n_embd)
            model = transformers.GPT2LMHeadModel(config)

        self.model = model
        self._config = config

        self._train_file = train_file
        self._valid_file = valid_file
        self._test_file = test_file
        self._batch_size = batch_size
        self._prefetch_factor = prefetch_factor
        self._num_workers = num_workers
        self._shuffle_buffer_size = shuffle_buffer_size
        self._lr = lr
        self._num_warmup_steps = num_warmup_steps
        self._num_training_steps = num_training_steps
Example #8
split_lens[1] = len(dataset) - split_lens[0]

train_set, valid_set = torch.utils.data.random_split(dataset, split_lens)

print("Loading Model...")

config = transformers.GPT2Config(
    vocab_size=261,
    n_positions=seq_len,
    n_ctx=seq_len,
    n_embd=30,
    n_layer=3,
    n_head=3
)

model = transformers.GPT2LMHeadModel(config=config)

print("Training Model...")

writer = SummaryWriter()

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_first_step=True,
    save_steps=2000,