Exemplo n.º 1
0
 def setUp(self):
     self.set_attr()
     self.create_input_file()
     self.set_test_case()
     self.tokenizer = RobertaTokenizer(vocab_file=self.vocab_file,
                                       merges_file=None,
                                       do_lower_case=self.do_lower_case)
Exemplo n.º 2
0
    def test_from_pretrained_pad_left(self):
        tokenizer = RobertaTokenizer.from_pretrained("roberta-wwm-ext")
        tokenizer.padding_side = "left"
        text1 = "这是一个简单文本"
        text2 = "小孩子都看得懂"
        # test batch_encode
        expected_input_ids = [
            0, 0, 101, 6821, 3221, 671, 702, 5042, 1296, 3152, 3315, 102, 2207,
            2111, 2094, 6963, 4692, 2533, 2743, 102
        ]
        expected_token_type_ids = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
        ]
        expected_attention_mask = [
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
        expected_special_tokens_mask = [
            1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1
        ]
        results = tokenizer([text1], [text2],
                            20,
                            stride=1,
                            pad_to_max_seq_len=True,
                            return_attention_mask=True,
                            return_special_tokens_mask=True)

        self.check_output_equal(results[0]['input_ids'], expected_input_ids)
        self.check_output_equal(results[0]['token_type_ids'],
                                expected_token_type_ids)
        self.check_output_equal(results[0]['attention_mask'],
                                expected_attention_mask)
        self.check_output_equal(results[0]['special_tokens_mask'],
                                expected_special_tokens_mask)
        # test encode
        results = tokenizer(text1,
                            text2,
                            20,
                            stride=1,
                            pad_to_max_seq_len=True)
        self.check_output_equal(results['input_ids'], expected_input_ids)
        self.check_output_equal(results['token_type_ids'],
                                expected_token_type_ids)
Exemplo n.º 3
0
def train():
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len,
                                 noise_prob=args.noise_prob,
                                 use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    label_num = model.word_emb.weight.shape[0]
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    max_steps = len(train_data_loader) * args.num_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            # import pdb; pdb.set_trace()
            _, __, info = model(src_ids,
                                sent_ids=src_sids,
                                pos_ids=src_pids,
                                attn_bias=mask_src_2_src,
                                encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = model(tgt_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_tgt_2_srctgt,
                                past_cache=(cached_k, cached_v),
                                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                paddle.concat([k, k2], 1)
                for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                paddle.concat([v, v2], 1)
                for v, v2 in zip(cached_v, cached_v2)
            ]
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            loss, _, __ = model(attn_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_attn_2_srctgtattn,
                                past_cache=(past_cache_k, past_cache_v),
                                tgt_labels=tgt_labels,
                                tgt_pos=paddle.nonzero(attn_ids == attn_id))
            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss, args.logging_steps /
                           (time.time() - tic_train), lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0 and (
                (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
Exemplo n.º 4
0
def train():
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset(
        'poetry', splits=('train', 'dev'), lazy=False)
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters using in DataParallel
        # must participate in the calculation of losses and subsequent gradient calculations.
        # So we use StackModel here to make the model only output loss in its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids,
             mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels, _) = batch
            # import pdb; pdb.set_trace()
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss, args.logging_steps /
                           (time.time() - tic_train), lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and paddle.distributed.get_rank(
            ) == 0:
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
Exemplo n.º 5
0
def predict():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=test_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences = []
    evaluated_sentences_ids = []
    logger.info("Predicting...")
    for data in data_loader:
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use target when infer
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)

        for source_ids, target_ids, predict_ids in zip(
                src_ids.numpy().tolist(),
                raw_tgt_labels.numpy().tolist(), output_ids.tolist()):
            if eos_id in predict_ids:
                predict_ids = predict_ids[:predict_ids.index(eos_id)]
            source_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(source_ids[1:source_ids.index(eos_id)])))
            tgt_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(target_ids[1:target_ids.index(eos_id)])))
            predict_ids = ''.join(
                map(post_process, vocab.to_tokens(predict_ids)))
            print("source :%s\ntarget :%s\npredict:%s\n" %
                  (source_sentence, tgt_sentence, predict_ids))
Exemplo n.º 6
0
def evaluate():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=dev_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences_ids = []
    reference_sentences_ids = []
    logger.info("Evaluating...")
    for data in tqdm(data_loader):
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use target when infer
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)

        for ids in output_ids.tolist():
            if eos_id in ids:
                ids = ids[:ids.index(eos_id)]
            evaluated_sentences_ids.append(ids)

        for ids in raw_tgt_labels.numpy().tolist():
            ids = ids[:ids.index(eos_id)]
            reference_sentences_ids.append(ids)

    score1 = rouge1.score(evaluated_sentences_ids, reference_sentences_ids)
    score2 = rouge2.score(evaluated_sentences_ids, reference_sentences_ids)

    logger.info("Rouge-1: %.5f ,Rouge-2: %.5f" % (score1 * 100, score2 * 100))
Exemplo n.º 7
0
class TestRobertaTokenizer(CpuCommonTest):
    def set_attr(self):
        self.do_lower_case = True

    def create_input_file(self):
        vocab = [
            "[UNK]", "[CLS]", "[SEP]", "th", "##is", "is", "simple", "text",
            "a", "an", "for", "easy", "which", "children", "[MASK]", "[PAD]"
        ]
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        vocab_file = os.path.join(curr_dir, "vocab.txt")
        with open(vocab_file, "w") as fw:
            for v in vocab:
                fw.write(v)
                fw.write('\n')
        self.vocab_file = vocab_file
        self.vocab = vocab

    def set_test_case(self):
        self.text = "this is a simple text"
        self.expected_text_array = ['th', '##is', 'is', 'a', 'simple', 'text']

    def setUp(self):
        self.set_attr()
        self.create_input_file()
        self.set_test_case()
        self.tokenizer = RobertaTokenizer(vocab_file=self.vocab_file,
                                          merges_file=None,
                                          do_lower_case=self.do_lower_case)

    def test_tokenize(self):
        text_array = self.tokenizer.tokenize(self.text)
        self.check_output_equal(text_array, self.expected_text_array)
        self.check_output_equal(
            self.tokenizer.convert_tokens_to_string(text_array), self.text)

    def test_call(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
        expected_attention_mask = [1] * len(expected_input_ids)
        expected_tokens_mask = [1, 0, 0, 0, 0, 0, 0, 1]
        result = self.tokenizer("This  is a simple text",
                                return_attention_mask=True,
                                return_length=True,
                                return_special_tokens_mask=True)
        self.check_output_equal(result['input_ids'], expected_input_ids)
        self.check_output_equal(result['token_type_ids'],
                                expected_token_type_ids)
        self.check_output_equal(result['seq_len'],
                                len(expected_token_type_ids))
        self.check_output_equal(result['attention_mask'],
                                expected_attention_mask)
        self.check_output_equal(result['special_tokens_mask'],
                                expected_tokens_mask)

    def test_call_pair(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2, 12, 5, 11, 10, 13, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

        result = self.tokenizer("This is a simple text",
                                "which is easy for children")
        self.check_output_equal(result['input_ids'], expected_input_ids)
        self.check_output_equal(result['token_type_ids'],
                                expected_token_type_ids)

    def test_call_batch(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
        results = self.tokenizer(
            ["This is a simple text", "this Is A    simple text"])
        for result in results:
            self.check_output_equal(result['input_ids'], expected_input_ids)
            self.check_output_equal(result['token_type_ids'],
                                    expected_token_type_ids)

    def test_call_truncate_seq(self):
        expected_input_ids = [1, 3, 2, 3, 2]
        expected_token_type_ids = [0, 0, 0, 1, 1]
        results = self.tokenizer("This is a simple text",
                                 "this Is A    simple text", 5)

        self.check_output_equal(results['input_ids'], expected_input_ids)
        self.check_output_equal(results['token_type_ids'],
                                expected_token_type_ids)

    # Test PretrainedTokenizer
    def test_truncate_only_first(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]

        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='only_first')
        self.check_output_equal(truncate_ids, ids[:-3])
        self.check_output_equal(truncate_pair_ids, pair_ids)

    def test_truncate_only_second(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]

        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='only_second')
        self.check_output_equal(truncate_ids, ids)
        self.check_output_equal(truncate_pair_ids, pair_ids[:-3])

    @assert_raises(ValueError)
    def test_truncate_do_not_truncate(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='do_not_truncate')

    @assert_raises(ValueError)
    def test_truncate_error_strategy(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 1, truncation_strategy='')

    def test_save_pretrained(self):
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        model_path = os.path.join(curr_dir, "pretrained_model")
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        self.tokenizer.save_pretrained(model_path)

        vocab_path = os.path.join(
            model_path, self.tokenizer.resource_files_names['vocab_file'])
        with open(vocab_path, "r") as fr:
            vocabs = [vocab.strip() for vocab in fr.readlines()]
        self.check_output_equal(vocabs, self.vocab)

    @assert_raises(RuntimeError)
    def test_from_pretrained_non_exist(self):
        RobertaTokenizer.from_pretrained("")

    def test_vocab_size(self):
        self.check_output_equal(self.tokenizer.vocab_size, len(self.vocab))

    def test_all_special_tokens(self):
        expected_special_tokens_set = set([
            self.tokenizer.pad_token, self.tokenizer.mask_token,
            self.tokenizer.cls_token, self.tokenizer.unk_token,
            self.tokenizer.sep_token
        ])
        self.check_output_equal(set(self.tokenizer.all_special_tokens),
                                expected_special_tokens_set)
        self.check_output_equal(set(self.tokenizer.all_special_ids),
                                set([0, 1, 2, 14, 15]))

    @assert_raises(ValueError)
    def test_non_exist_vocab_file(self):
        RobertaTokenizer("non_exist.txt", merges_file=None)
Exemplo n.º 8
0
 def test_non_exist_vocab_file(self):
     RobertaTokenizer("non_exist.txt", merges_file=None)
Exemplo n.º 9
0
 def test_from_pretrained_non_exist(self):
     RobertaTokenizer.from_pretrained("")