import os

import numpy as np
import pytest

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.transforms.c_transforms as ops
from mindspore import log as logger

# Assumed to be defined alongside these helpers: BERT_TOKENIZER_FILE (path to
# the tokenizer test-data file) and vocab_bert (the BERT vocabulary word list).


def check_bert_tokenizer(first, last, expect_str, vocab_list,
                         suffix_indicator='##', max_bytes_per_token=100,
                         unknown_token='[UNK]', lower_case=False,
                         keep_whitespace=False,
                         normalization_form=text.utils.NormalizeForm.NONE,
                         preserve_unused_token=False):
    """Tokenize lines [first, last] of the test file and compare against expect_str."""
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.BertTokenizer(
        vocab=vocab, suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
        lower_case=lower_case, keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        count = count + 1
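# Illustrative sketch (not part of the original file): how a test case might
# drive check_bert_tokenizer. The vocab entries, line range, and expected
# tokens below are placeholders chosen to show the calling convention, not
# real data from BERT_TOKENIZER_FILE.
def test_bert_tokenizer_default_sketch():
    vocab_list = ['床', '前', '明', '月', '光', '[UNK]']
    expect_str = [['床', '前', '明', '月', '光']]
    check_bert_tokenizer(first=1, last=1, expect_str=expect_str,
                         vocab_list=vocab_list)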
def test_bert_tokenizer_callable_invalid_input():
    """
    Test BertTokenizer in eager mode with invalid (non-tensor-convertible) input
    """
    data = {'张三': 18, '王五': 20}
    vocab = text.Vocab.from_list(vocab_bert)
    tokenizer_op = text.BertTokenizer(vocab=vocab)
    with pytest.raises(TypeError) as info:
        _ = tokenizer_op(data)
    assert "Invalid user input. Got <class 'dict'>: {'张三': 18, '王五': 20}, " \
           "cannot be converted into tensor." in str(info)
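# Hedged counterpart sketch (assumption, not from the original file): with a
# valid scalar-string input the same eager call should succeed and return a
# NumPy array of tokens. The vocab below is a placeholder, not vocab_bert.
def test_bert_tokenizer_callable_valid_input_sketch():
    vocab = text.Vocab.from_list(['my', 'favorite', 'book', '[UNK]'])
    tokenizer_op = text.BertTokenizer(vocab=vocab)
    tokens = tokenizer_op('my favorite book')  # eager-mode call on a str
    assert tokens.shape == (3,)  # one token per in-vocab word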
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label_desc",
                              output_columns="label_id")
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup, input_columns=["sentence"],
                          output_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["label_id", "text_ids", "mask_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "segment_ids"],
                          column_order=["label_id", "text_ids", "mask_ids", "segment_ids"])
    dataset = dataset.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
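# Hedged usage sketch (assumption, not from the original file): the paths and
# label list are placeholders; a real run needs a local TNEWS download from
# CLUE and the full set of label_desc values present in the JSON files, since
# Lookup has no unknown_token here and would fail on unseen labels.
def _example_tnews_pipeline_sketch():
    tnews_label_list = ['news_story', 'news_culture', 'news_entertainment']  # placeholder subset
    return process_tnews_clue_dataset(data_dir='./tnews_public',
                                      label_list=tnews_label_list,
                                      bert_vocab_path='./vocab.txt',
                                      data_usage='eval',
                                      max_seq_len=128, batch_size=32)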
def check_bert_tokenizer_with_offsets(
        first, last, expect_str, expected_offsets_start, expected_offsets_limit,
        vocab_list, suffix_indicator='##', max_bytes_per_token=100,
        unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
        normalization_form=text.utils.NormalizeForm.NONE,
        preserve_unused_token=False):
    """Tokenize with offsets and compare tokens and byte offsets against expectations."""
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.BertTokenizer(
        vocab=vocab, suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
        lower_case=lower_case, keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token, with_offsets=True)
    dataset = dataset.map(
        operations=tokenizer_op, input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1
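# Illustrative sketch (not part of the original file): a with-offsets test
# supplies expected byte offsets alongside the expected tokens. The values
# below assume one line of five CJK characters (3 UTF-8 bytes each) and are
# placeholders, not real data from BERT_TOKENIZER_FILE.
def test_bert_tokenizer_with_offsets_sketch():
    vocab_list = ['床', '前', '明', '月', '光', '[UNK]']
    expect_str = [['床', '前', '明', '月', '光']]
    expected_offsets_start = [[0, 3, 6, 9, 12]]
    expected_offsets_limit = [[3, 6, 9, 12, 15]]
    check_bert_tokenizer_with_offsets(
        first=1, last=1, expect_str=expect_str,
        expected_offsets_start=expected_offsets_start,
        expected_offsets_limit=expected_offsets_limit,
        vocab_list=vocab_list)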
def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64, drop_remainder=True):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label",
                              output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncating the pair, reserving three slots
    ### for [CLS] and the two [SEP] tokens
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence1"],
                          output_columns=["sentence1", "type_sentence1"],
                          column_order=["sentence1", "type_sentence1",
                                        "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=["sentence1", "type_sentence1",
                                        "sentence2", "type_sentence2", "label_id"])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)], input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)], input_columns=["type_sentence2"])
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["type_sentence1", "type_sentence2"],
                          output_columns=["segment_ids"],
                          column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset