def __init__(self, mask_rate, bert_model, do_lower_case, max_seq_length):
    """Set up the random-masking component.

    Args:
        mask_rate: stored masking ratio (presumably the fraction of tokens
            to mask — confirm against the caller).
        bert_model: name or path of the pretrained BERT model whose
            tokenizer is loaded.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
    """
    super(RandMask, self).__init__()
    # Plain hyper-parameter storage.
    self.mask_rate = mask_rate
    self.max_seq_length = max_seq_length
    # The tokenizer's vocabulary is materialized as a list of token strings.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
def __init__(self, mask_rate, top_sen_rate, threshold, bert_model, do_lower_case, max_seq_length, label_list, sen_batch_size, use_gpu=True):
    """Set up the sentence-classification component.

    Loads a pretrained BERT tokenizer and sequence classifier, moves the
    model to GPU when available (and requested), and wraps it in
    ``DataParallel`` when more than one GPU is visible.

    Args:
        mask_rate: stored masking ratio hyper-parameter.
        top_sen_rate: stored sentence-selection ratio hyper-parameter.
        threshold: stored decision threshold hyper-parameter.
        bert_model: name or path of the pretrained BERT model.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
        label_list: classification labels; its length sets ``num_labels``.
        sen_batch_size: batch size used for sentence scoring, stored.
        use_gpu: if False, force CPU even when CUDA is available.
    """
    super(SC, self).__init__()
    # Hyper-parameters are kept verbatim on the instance.
    self.mask_rate = mask_rate
    self.top_sen_rate = top_sen_rate
    self.threshold = threshold
    self.label_list = label_list
    self.num_labels = len(self.label_list)
    self.max_seq_length = max_seq_length
    self.sen_batch_size = sen_batch_size
    # Tokenizer, vocabulary snapshot, and classifier.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
    self.model = BertForSequenceClassification.from_pretrained(
        bert_model, num_labels=self.num_labels)
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
    self.model.to(self.device)
    self.n_gpu = torch.cuda.device_count()
    if self.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)
def get_preprocessor(ptr_config_info, model_config):
    """Build a PreProcessor from a pickled vocabulary and a pretrained tokenizer.

    The vocabulary is unpickled from ``ptr_config_info.vocab``. The split
    function depends on ``model_config.type``: 'etri' uses an
    ``ETRITokenizer``'s ``tokenize`` method, 'skt' uses a
    ``SentencepieceTokenizer`` instance directly. Both variants pad to
    ``model_config.length`` with the vocabulary's padding index.
    """
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    # Padding behaviour is identical for both tokenizer flavours.
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))

    if model_config.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer,
                                                      do_lower_case=False)
        split_fn = ptr_tokenizer.tokenize
    elif model_config.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        split_fn = ptr_tokenizer

    # Unknown types fall through with `split_fn` unbound, matching the
    # original's NameError behaviour for unsupported configs.
    return PreProcessor(vocab=vocab, split_fn=split_fn, pad_fn=pad_sequence)
def __init__(self, mask_rate, bert_model, do_lower_case, max_seq_length, sen_batch_size, with_rand=False, use_gpu=True):
    """Set up the model-based mask generator.

    Loads a pretrained BERT tokenizer and a 2-label token classifier,
    moves the model to GPU when available (and requested), and wraps it
    in ``DataParallel`` when more than one GPU is visible.

    Args:
        mask_rate: stored masking ratio hyper-parameter.
        bert_model: name or path of the pretrained BERT model.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
        sen_batch_size: batch size used when scoring sentences, stored.
        with_rand: flag stored on the instance (presumably mixes in random
            masking — confirm against the caller).
        use_gpu: if False, force CPU even when CUDA is available.
    """
    super(ModelGen, self).__init__()
    # Hyper-parameters kept verbatim on the instance.
    self.mask_rate = mask_rate
    self.max_seq_length = max_seq_length
    self.sen_batch_size = sen_batch_size
    self.with_rand = with_rand
    # Tokenizer, vocabulary snapshot, and the binary token classifier.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
    self.model = BertForTokenClassification.from_pretrained(bert_model,
                                                            num_labels=2)
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
    self.model.to(self.device)
    self.n_gpu = torch.cuda.device_count()
    if self.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)
def main():
    """Fine-tune and/or run predictions with BERT on SQuAD-style QA data.

    Command-line driven: parses arguments, sets up the (optionally
    distributed) device and RNG seeds, then either trains
    ``BertForQuestionAnswering`` (saving the best checkpoint by dev loss)
    or writes SQuAD prediction files for an existing model.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file", default=None, type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument(
        "--max_seq_length", default=384, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument(
        "--doc_stride", default=128, type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument(
        "--max_query_length", default=64, type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size", default=20, type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length", default=30, type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging", action='store_true',
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--version_2_with_negative', action='store_true',
        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold', type=float, default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    # Device / distributed setup: local_rank == -1 means single-process mode.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Per-step micro-batch size; the effective batch size stays as requested.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        # Fix: the original message contained an empty "()" placeholder;
        # include the offending path in the error.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        train_dataloader = prepare_data(train_examples, tokenizer, args,
                                        task_name="train")
        num_train_optimization_steps = int(
            len(train_dataloader) /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model.
    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    def eval_model(model, data_loader):
        """Return the mean loss of `model` over `data_loader`."""
        eval_loss = 0
        model.eval()
        for step, batch in enumerate(
                tqdm(data_loader, desc="Evaluating",
                     disable=args.local_rank not in [-1, 0])):
            if n_gpu == 1:
                # multi-gpu (DataParallel) does the scattering itself.
                batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            eval_loss += loss.detach().cpu()
        return eval_loss / (step + 1)

    eval_examples = read_squad_examples(
        input_file=args.predict_file, is_training=False,
        version_2_with_negative=args.version_2_with_negative)
    eval_dataloader = prepare_data(eval_examples, tokenizer, args,
                                   task_name='eval')

    if args.do_train:
        # Exclude the (unused) pooler; no weight decay on biases/LayerNorm.
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        model.train()
        train_loss = 0
        min_loss = 100000
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Training",
                         disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    # multi-gpu (DataParallel) does the scattering itself.
                    batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                train_loss += loss.detach().cpu()
            eval_loss = eval_model(model, eval_dataloader)
            print(f'epoch {epoch} train loss {train_loss / (step + 1)} eval_loss {eval_loss}')
            # Checkpoint only when the dev loss improves.
            if eval_loss < min_loss:
                min_loss = eval_loss
                # Only save the model itself (unwrap DataParallel).
                model_to_save = model.module if hasattr(model, 'module') else model
                # If we save using the predefined names, we can load using
                # `from_pretrained`.
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                model_to_save.config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)
    elif args.do_predict:
        predict_dataloader, predict_features = prepare_data(
            eval_examples, tokenizer, args, task_name='predict')
        all_results = []
        logger.info("Start Predicting")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                predict_dataloader, desc="Predicting",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                # Answer start/end logits for one example.
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                # Fix: the original rebound `predict_features` itself to a
                # single feature here, clobbering the list so every later
                # lookup (and write_predictions below) operated on the wrong
                # object. Use a separate local instead.
                eval_feature = predict_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")
        write_predictions(eval_examples, predict_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold)
    else:
        # Unreachable after the earlier do_train/do_predict check; kept as a
        # guard. Fix: the original message used mismatched fullwidth quotes.
        raise ValueError(
            "please confirm at least one task mode, such as 'train' or 'predict'.")
def main():
    """Train / evaluate a BERT token-classification model for mask generation.

    Command-line driven: trains ``BertForTokenClassification`` on the
    ``maskgen`` task (saving a checkpoint per epoch under
    ``output_dir/all_models``), then evaluates every epoch's checkpoint on
    the dev set and promotes the best-F1 checkpoint to
    ``output_dir/best_model``.
    """
    import shutil  # local import: the file's top-level import block is kept untouched

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--ckpt", default="", type=str)
    parser.add_argument("--vocab_file", default="", type=str)
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--sample_weight', type=float, default=1)
    parser.add_argument("--save_all", action="store_true")
    args = parser.parse_args()

    processors = {"maskgen": MaskGenProcessor}

    # Device / distributed setup: local_rank == -1 means single-process mode.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Fix: the CUDA RNG was never seeded; kept consistent with the sibling
    # QA training script.
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.warning(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()
    if args.vocab_file:
        tokenizer = BertTokenizer(args.vocab_file, args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

    # Prepare model; optionally warm-start from a checkpoint's 'model' dict,
    # keeping only keys that exist in the fresh model.
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       num_labels=num_labels)
    if args.ckpt:
        print("load from", args.ckpt)
        model_dict = model.state_dict()
        ckpt = torch.load(args.ckpt)
        pretrained_dict = ckpt['model']
        new_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict.keys()
        }
        model_dict.update(new_dict)
        print('Total : {}, update: {}'.format(len(pretrained_dict),
                                              len(new_dict)))
        model.load_state_dict(model_dict)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # Per-class loss weight: [negative, positive].
        if args.fp16:
            sample_weight = torch.HalfTensor([1.0, args.sample_weight]).cuda()
        else:
            sample_weight = torch.FloatTensor([1.0, args.sample_weight]).cuda()

        # Cache converted features next to the data, keyed by model/length/task.
        cached_train_features_file = os.path.join(
            args.data_dir, 'train_{}_{}_{}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                logger.info("Load from cache dir: {}".format(
                    cached_train_features_file))
                train_features = pickle.load(reader)
        except Exception:
            # Fix: was a bare `except:`; cache misses/corruption still fall
            # through to a fresh conversion, but KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("Saving train features into cached file {}".format(
                    cached_train_features_file))
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # No weight decay on biases/LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        label_map = {i: label for i, label in enumerate(label_list, 1)}
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        os.makedirs(os.path.join(args.output_dir, "all_models"), exist_ok=True)
        model.train()
        for e in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             weight=sample_weight)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # Save each epoch (evaluation below picks the best one).
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "all_models",
                                             "e{}_{}".format(e, WEIGHTS_NAME))
            torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForTokenClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    ### Evaluation: score every per-epoch checkpoint on the dev set and keep
    ### the one with the best token-level F1.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        best_f1 = 0
        best_epoch = 0
        val_res_file = os.path.join(args.output_dir, "valid_results.txt")
        val_f = open(val_res_file, "w")
        logger.info("***** Dev Eval results *****")
        for e in range(int(args.num_train_epochs)):
            weight_path = os.path.join(args.output_dir, "all_models",
                                       "e{}_{}".format(e, WEIGHTS_NAME))
            model.load_state_dict(torch.load(weight_path))
            model.to(device)
            eval_examples = processor.get_dev_examples(args.data_dir)
            cached_eval_features_file = os.path.join(
                args.data_dir, 'dev_{0}_{1}_{2}'.format(
                    list(filter(None, args.bert_model.split('/'))).pop(),
                    str(args.max_seq_length), str(task_name)))
            try:
                with open(cached_eval_features_file, "rb") as reader:
                    eval_features = pickle.load(reader)
            except Exception:
                # Fix: was a bare `except:` (see the training-cache note above).
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)
                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                    logger.info("  Saving eval features into cached file %s",
                                cached_eval_features_file)
                    with open(cached_eval_features_file, "wb") as writer:
                        pickle.dump(eval_features, writer)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data.
            if args.local_rank == -1:
                eval_sampler = SequentialSampler(eval_data)
            else:
                eval_sampler = DistributedSampler(
                    eval_data)  # Note that this sampler samples randomly
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            y_true_L = []
            y_pred_L = []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask)
                logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                y_true = [[str(x) for x in L] for L in label_ids]
                y_pred = [[str(x) for x in L] for L in logits]
                # Only score positions where the attention mask is 1
                # (i.e. skip padding).
                for (m, t, p) in zip(input_mask, y_true, y_pred):
                    for mm, tt, pp in zip(m, t, p):
                        if mm == 1:
                            y_true_L.append(int(tt))
                            y_pred_L.append(int(pp))
            acc = accuracy_score(y_true_L, y_pred_L)
            f1 = f1_score(y_true_L, y_pred_L)
            recall = recall_score(y_true_L, y_pred_L)
            prec = precision_score(y_true_L, y_pred_L)
            if f1 > best_f1:
                best_f1 = f1
                best_epoch = e
            result = {"acc": acc, "f1": f1, "recall": recall, "prec": prec}
            logger.info("Epoch {}".format(e))
            val_f.write("Epoch {}\n".format(e))
            for key in sorted(result.keys()):
                logger.info("{} = {}".format(key, str(result[key])))
                val_f.write("{} = {}\n".format(key, str(result[key])))
            val_f.write("\n")
        logger.info("\nBest epoch: {}. Best val f1: {}".format(
            best_epoch, best_f1))
        val_f.write("Best epoch: {}. Best val f1: {}\n".format(
            best_epoch, best_f1))
        val_f.close()

        # Promote the best checkpoint to output_dir/best_model.
        best_weight_path = os.path.join(
            args.output_dir, "all_models",
            "e{}_{}".format(best_epoch, WEIGHTS_NAME))
        best_model_dir = os.path.join(args.output_dir, "best_model")
        os.makedirs(best_model_dir, exist_ok=True)
        # Fix: shell `cp` replaced with shutil (no shell, works without /bin/cp).
        shutil.copyfile(best_weight_path,
                        os.path.join(best_model_dir, WEIGHTS_NAME))
        # Fix: `model_to_save` was only defined inside the training branch,
        # so `--do_eval` without `--do_train` crashed with NameError here.
        model_to_save = model.module if hasattr(model, 'module') else model
        with open(os.path.join(best_model_dir, CONFIG_NAME), 'w') as f:
            f.write(model_to_save.config.to_json_string())
        # NOTE(review): stock BertTokenizer exposes `save_vocabulary`, not
        # `save_vocab` — confirm this is a project-specific method.
        tokenizer.save_vocab(os.path.join(best_model_dir, VOCAB_NAME))
        if not args.save_all:
            # Fix: shell `rm -r` replaced with shutil.rmtree.
            shutil.rmtree(os.path.join(args.output_dir, "all_models"))
def main():
    """Fine-tune BERT for sequence classification from the command line.

    Workflow: parse CLI arguments, set up the device / (optional) distributed
    backend, seed all RNGs, build the task processor + tokenizer + model
    (optionally warm-started from --ckpt), then:
      * --do_train: train for --num_train_epochs epochs, saving one weight
        file per epoch under <output_dir>/all_models/;
      * --do_eval: evaluate every saved epoch on the dev set, pick the best
        by accuracy, report test results for it, and copy that checkpoint to
        <output_dir>/best_model/ (deleting all_models/ unless --save_all).
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--vocab_file", default="", type=str)
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--ckpt", type=str, help="ckpt position")
    parser.add_argument("--save_all", action="store_true")
    parser.add_argument("--output_dev_detail", action="store_true")
    args = parser.parse_args()

    # ---- Device / distributed setup -------------------------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is train_batch_size / accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # ---- Task / tokenizer / model setup ---------------------------------
    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    if args.vocab_file:
        tokenizer = BertTokenizer(args.vocab_file, args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, num_labels=num_labels)
    if args.ckpt:
        # Warm-start from an external checkpoint, keeping only parameters
        # that exist in this model and skipping the task-specific classifier
        # head (its label count may differ).
        print("load from", args.ckpt)
        model_dict = model.state_dict()
        ckpt = torch.load(args.ckpt)
        if "model" in ckpt:
            pretrained_dict = ckpt['model']
        else:
            pretrained_dict = ckpt
        new_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict.keys()
            and k not in ["classifier.weight", "classifier.bias"]
        }
        model_dict.update(new_dict)
        print('Total : {}, update: {}'.format(len(pretrained_dict),
                                              len(new_dict)))
        model.load_state_dict(model_dict)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    # ---- Training -------------------------------------------------------
    if args.do_train:
        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        print(len(train_examples))
        cached_train_features_file = os.path.join(
            args.data_dir, 'train_{0}_{1}_{2}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Only a missing or unreadable cache
        # file should trigger re-featurization.
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = [f.input_ids for f in train_features]
        all_input_mask = [f.input_mask for f in train_features]
        all_segment_ids = [f.segment_ids for f in train_features]

        # NOTE(review): both branches are currently identical; presumably the
        # regression path once cast labels to float — confirm against
        # InputDataset.collate before simplifying.
        if output_mode == "classification":
            all_label_ids = [f.label_id for f in train_features]
        elif output_mode == "regression":
            all_label_ids = [f.label_id for f in train_features]

        train_data = InputDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      collate_fn=train_data.collate)

        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer: no weight decay on biases and LayerNorm params.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        os.makedirs(os.path.join(args.output_dir, "all_models"),
                    exist_ok=True)
        model.train()
        for e in trange(int(args.num_train_epochs),
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):
                inputs, labels = batch
                for key in inputs.keys():
                    inputs[key] = inputs[key].to(args.device)
                for key in labels.keys():
                    labels[key] = labels[key].to(args.device)

                # define a new function to compute loss values for both
                # output_modes
                label_ids = labels["labels"]
                logits = model(**inputs)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # save each epoch (unwrap DataParallel/DDP before saving)
            model_to_save = model.module if hasattr(model,
                                                    'module') else model
            output_model_file = os.path.join(args.output_dir, "all_models",
                                             "e{}_{}".format(e, WEIGHTS_NAME))
            torch.save(model_to_save.state_dict(), output_model_file)

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        # Evaluate every per-epoch checkpoint on dev, track the best accuracy.
        best_acc = 0
        best_epoch = 0
        val_res_file = os.path.join(args.output_dir, "valid_results.txt")
        val_f = open(val_res_file, "w")
        if args.output_dev_detail:
            logger.info("***** Dev Eval results *****")
        for e in tqdm(range(int(args.num_train_epochs)), desc="Epoch on dev"):
            weight_path = os.path.join(args.output_dir, "all_models",
                                       "e{}_{}".format(e, WEIGHTS_NAME))
            result = evaluate(args,
                              model,
                              weight_path,
                              processor,
                              device,
                              task_name,
                              "dev",
                              label_list,
                              tokenizer,
                              output_mode,
                              num_labels,
                              show_detail=False)
            if result["acc"] > best_acc:
                best_acc = result["acc"]
                best_epoch = e
            if args.output_dev_detail:
                logger.info("Epoch {}".format(e))
            val_f.write("Epoch {}\n".format(e))
            for key in sorted(result.keys()):
                if args.output_dev_detail:
                    logger.info("{} = {}".format(key, str(result[key])))
                val_f.write("{} = {}\n".format(key, str(result[key])))
            val_f.write("\n")
        logger.info("\nBest epoch: {}. Best val acc: {}".format(
            best_epoch, best_acc))
        val_f.write("Best epoch: {}. Best val acc: {}\n".format(
            best_epoch, best_acc))
        val_f.close()

        # Report test results for the best dev epoch.
        test_weight_path = os.path.join(
            args.output_dir, "all_models",
            "e{}_{}".format(best_epoch, WEIGHTS_NAME))
        test_result = evaluate(args, model, test_weight_path, processor,
                               device, task_name, "test", label_list,
                               tokenizer, output_mode, num_labels)
        test_res_file = os.path.join(args.output_dir, "test_results.txt")
        logger.info("***** Test Eval results *****")
        with open(test_res_file, "w") as test_f:
            for key in sorted(test_result.keys()):
                logger.info("{} = {}".format(key, str(test_result[key])))
                test_f.write("{} = {}\n".format(key, str(test_result[key])))

        # Persist the best checkpoint + config + vocab as "best_model".
        best_model_dir = os.path.join(args.output_dir, "best_model")
        os.makedirs(best_model_dir, exist_ok=True)
        os.system("cp {} {}/{}".format(test_weight_path, best_model_dir,
                                       WEIGHTS_NAME))
        with open(os.path.join(best_model_dir, CONFIG_NAME), 'w') as f:
            # BUGFIX: `model_to_save` was only bound inside the --do_train
            # epoch loop, so --do_eval without --do_train raised NameError
            # here. Derive it from `model` at the point of use instead.
            model_to_save = model.module if hasattr(model, 'module') else model
            f.write(model_to_save.config.to_json_string())
        tokenizer.save_vocab(os.path.join(best_model_dir, VOCAB_NAME))
        if not args.save_all:
            os.system("rm -r {}".format(
                os.path.join(args.output_dir, "all_models")))
zipfile_path = ptr_dir / "etri.zip" if not zipfile_path.exists(): url = "https://drive.google.com/uc?id=1qVY-zZc2O2OliGNUwWClhcqJkLG_6uoD" gdown.download(url, output=str(zipfile_path)) with zipfile.ZipFile(str(zipfile_path)) as unzip: unzip.extractall(str(ptr_dir)) from model.tokenization import BertTokenizer as ETRITokenizer # loading BertTokenizer ptr_config_path = ptr_dir / 'bert_config_etri.json' ptr_tokenizer_path = ptr_dir / "vocab.korean.rawtext.list" ptr_bert_path = ptr_dir / "pytorch_model_etri.bin" ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_tokenizer_path, do_lower_case=False) # generate vocab idx_to_token = list(ptr_tokenizer.vocab.keys()) token_to_idx = { token: idx for idx, token in enumerate(idx_to_token) } vocab = Vocab( idx_to_token, padding_token="[PAD]", unknown_token="[UNK]", bos_token=None, eos_token=None, reserved_tokens=["[CLS]", "[SEP]", "[MASK]"], token_to_idx=token_to_idx,