def check_bert_tokenizer(first,
                         last,
                         expect_str,
                         vocab_list,
                         suffix_indicator='##',
                         max_bytes_per_token=100,
                         unknown_token='[UNK]',
                         lower_case=False,
                         keep_whitespace=False,
                         normalization_form=nlp.utils.NormalizeForm.NONE,
                         preserve_unused_token=False):
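    """
    Tokenize lines [first, last] of BERT_TOKENIZER_FILE with BertTokenizer
    and compare the output against expect_str.
    """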
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.BertTokenizer(
        vocab=vocab,
        suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token,
        unknown_token=unknown_token,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out:", text)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1
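# Hedged usage sketch for check_bert_tokenizer. The first line of
# BERT_TOKENIZER_FILE and the matching vocabulary below are illustrative
# assumptions; replace them with the actual contents of the test data file.
def example_check_bert_tokenizer():
    vocab_list = ["床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "[UNK]"]
    expect_str = [["床", "前", "明", "月", "光"]]
    check_bert_tokenizer(first=1, last=1, expect_str=expect_str, vocab_list=vocab_list)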
def test_bert_tokenizer_callable_invalid_input():
    """
    Test BertTokenizer in eager mode with invalid input
    """
    data = {'张三': 18, '王五': 20}
    vocab = text.Vocab.from_list(vocab_bert)
    tokenizer_op = text.BertTokenizer(vocab=vocab)

    with pytest.raises(TypeError) as info:
        _ = tokenizer_op(data)
    assert "Invalid user input. Got <class 'dict'>: {'张三': 18, '王五': 20}, cannot be converted into tensor." in str(
        info)
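# For contrast with the invalid-input case above, a hedged sketch of a valid
# eager-mode call. BertTokenizer instances are callable on a plain string (or a
# NumPy string array); `vocab_bert` is assumed to be the same module-level
# vocabulary list used in the test above.
def example_bert_tokenizer_callable_valid_input():
    vocab = text.Vocab.from_list(vocab_bert)
    tokenizer_op = text.BertTokenizer(vocab=vocab)
    tokens = tokenizer_op("床前明月光")  # eager execution, returns a NumPy array of tokens
    logger.info("Tokens: %s", text.to_str(tokens))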
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
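# Hedged usage sketch for process_tnews_clue_dataset. The data directory and
# vocabulary path are placeholders, and the label list below is an assumption:
# it must enumerate exactly the `label_desc` values stored in the TNEWS json
# files, since those strings are looked up against this vocabulary.
def example_process_tnews():
    tnews_labels = ["news_story", "news_culture", "news_entertainment", "news_sports",
                    "news_finance", "news_house", "news_car", "news_edu", "news_tech",
                    "news_military", "news_travel", "news_world", "news_stock",
                    "news_agriculture", "news_game"]
    label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
        data_dir="/path/to/tnews", label_list=tnews_labels,
        bert_vocab_path="/path/to/vocab.txt", data_usage='eval')
    logger.info("Loaded %d batches", len(label))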
def check_bert_tokenizer_with_offsets(
        first,
        last,
        expect_str,
        expected_offsets_start,
        expected_offsets_limit,
        vocab_list,
        suffix_indicator='##',
        max_bytes_per_token=100,
        unknown_token='[UNK]',
        lower_case=False,
        keep_whitespace=False,
        normalization_form=text.utils.NormalizeForm.NONE,
        preserve_unused_token=False):
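    """
    Tokenize lines [first, last] of BERT_TOKENIZER_FILE with BertTokenizer (with_offsets=True)
    and compare tokens and offsets against the expected values.
    """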
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.BertTokenizer(
        vocab=vocab,
        suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token,
        unknown_token=unknown_token,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token,
        with_offsets=True)
    dataset = dataset.map(
        operations=tokenizer_op,
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count = count + 1
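# A hedged eager-mode sketch of what with_offsets=True produces: instead of a
# single token array, the operation yields three outputs (tokens, byte start
# offsets, byte limit offsets). The vocabulary and input string are illustrative.
def example_bert_tokenizer_with_offsets_eager():
    vocab = text.Vocab.from_list(["my", "favor", "##ite", "book", "[UNK]"])
    tokenizer_op = text.BertTokenizer(vocab=vocab, with_offsets=True)
    token, offsets_start, offsets_limit = tokenizer_op("my favorite book")
    logger.info("Tokens: %s", text.to_str(token))
    logger.info("Offsets start: %s, limit: %s", offsets_start, offsets_limit)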
def process_cmnli_clue_dataset(data_dir,
                               label_list,
                               bert_vocab_path,
                               data_usage='train',
                               shuffle_dataset=False,
                               max_seq_len=128,
                               batch_size=64,
                               drop_remainder=True):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(
            operations=ops.Duplicate(),
            input_columns=["id"],
            output_columns=["id", "label_id"],
            column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0),
                              input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup,
                              input_columns="label",
                              output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(
        operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
        input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["sentence1"],
        output_columns=["sentence1", "type_sentence1"],
        column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(),
                          input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=[
                              "sentence1", "type_sentence1", "sentence2",
                              "type_sentence2", "label_id"
                          ])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)],
                          input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)],
                          input_columns=["type_sentence2"])
    dataset = dataset.map(
        operations=ops.Concatenate(),
        input_columns=["type_sentence1", "type_sentence2"],
        output_columns=["segment_ids"],
        column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["text_ids"],
        output_columns=["text_ids", "mask_ids"],
        column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset
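# Hedged usage sketch for process_cmnli_clue_dataset. The paths are
# placeholders; the label list only has to enumerate the label strings stored
# in the CMNLI json files ("entailment", "neutral", "contradiction") in a
# consistent order, since it just defines the label-to-id mapping.
def example_process_cmnli():
    dataset = process_cmnli_clue_dataset(
        data_dir="/path/to/cmnli", label_list=["entailment", "neutral", "contradiction"],
        bert_vocab_path="/path/to/vocab.txt", data_usage='train')
    for batch in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("text_ids shape: %s, segment_ids shape: %s",
                    batch["text_ids"].shape, batch["segment_ids"].shape)
        break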