def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
                             split_begin=None, split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA from CLUEDataset
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
                                  column_names=['text', 'label'])
    ### Processing label
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_ids")
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)), input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["label_ids"])
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer, input_columns=["text"],
                          output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)), input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup, input_columns=["sentence"], output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["input_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "input_mask"],
                          column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32),
                          input_columns=["input_mask"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "segment_ids"],
                          column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    return dataset
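# A minimal usage sketch (not part of the original source): the data directory, vocab path,
# and the BIO-style label list below are hypothetical placeholders, and the batch size is
# arbitrary. It only illustrates how process_ner_msra_dataset might be called.
msra_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
ner_dataset = process_ner_msra_dataset(data_dir="/path/to/msra",
                                       label_list=msra_labels,
                                       bert_vocab_path="/path/to/bert_vocab.txt",
                                       max_seq_len=128)
# The returned dataset yields aligned input_ids / input_mask / segment_ids / label_ids,
# so it can be batched and fed to a model.
ner_dataset = ner_dataset.batch(32, drop_remainder=True)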
def test_from_list_lookup_empty_string():
    # "" is a valid word in vocab, which can be looked up by LookupOp
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", ""], True)
    lookup = text.Lookup(vocab, "")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1

    # When unknown_token of Lookup is None, it converts to std::nullopt in C++,
    # so it has nothing to do with "" in vocab and C++ will skip looking up the unknown_token
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", ""], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    try:
        for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            pass
    except RuntimeError as e:
        assert "token: \"is\" doesn't exist in vocab and no unknown token is specified" in str(e)
def test_config(lookup_str, data_type=None):
    try:
        vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        # if data_type is None, test the default value of data_type
        op = text.Lookup(vocab, "<unk>") if data_type is None else text.Lookup(vocab, "<unk>", data_type)
        data = data.map(operations=op, input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"])
        return res[0].dtype
    except (ValueError, RuntimeError, TypeError) as e:
        return str(e)
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train',
                               shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id",
                              operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"],
                          operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
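# A minimal usage sketch (assumption, not from the original source): paths and the label
# list are hypothetical placeholders. process_tnews_clue_dataset batches the data itself
# and returns plain Python lists of batched columns.
tnews_labels = ["news_story", "news_culture", "news_entertainment"]  # hypothetical subset
label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
    data_dir="/path/to/tnews",
    label_list=tnews_labels,
    bert_vocab_path="/path/to/bert_vocab.txt",
    data_usage='train',
    max_seq_len=128,
    batch_size=64)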
def test_lookup_callable():
    """
    Test lookup is callable
    """
    logger.info("test_lookup_callable")
    vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    lookup = text.Lookup(vocab)
    word = "迎"
    assert lookup(word) == 3
def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, 6)  # default value is -1
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
def test_config(texts, top_k, special_tokens, special_first):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset, None, None, top_k, special_tokens, special_first)
    data = ds.GeneratorDataset(gen_input(texts), column_names=["text"])
    data = data.map(input_columns="text", operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    return res
def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2]
def test_from_list():
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "))
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
def test_from_file():
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    ind = 0
    res = [10, 11, 12, 15, 13, 14]
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, "<unk>")  # any unknown token will be mapped to the id of <unk>
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
def test_config(freq_range, top_k):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k)
    corpus_dataset = corpus_dataset.map(input_columns="text", operations=text.Lookup(vocab))
    res = []
    for d in corpus_dataset.create_dict_iterator():
        res.append(list(d["text"]))
    return res
def test_from_file_tutorial():
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", None, ["<pad>", "<unk>"], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [10, 11, 12, 15, 13, 14]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
def test_from_list_tutorial():
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", "<unk>"], True)
    lookup = text.Lookup(vocab, "<unk>")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
def test_config(lookup_str, vocab_input, special_tokens, special_first):
    try:
        vocab = text.Vocab.from_list(vocab_input, special_tokens, special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
        res = []
        for d in data.create_dict_iterator():
            res.append(d["text"].item())
        return res
    except ValueError as e:
        return str(e)
def test_config(lookup_str, vocab_input, special_tokens, special_first, unknown_token):
    try:
        vocab = text.Vocab.from_list(vocab_input, special_tokens, special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, unknown_token), input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res
    except (ValueError, RuntimeError, TypeError) as e:
        return str(e)
def test_config(lookup_str, vocab_size, special_tokens, special_first):
    try:
        vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size,
                                     special_tokens=special_tokens, special_first=special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, "s2"), input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res
    except ValueError as e:
        return str(e)
def test_on_tokenized_line():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(operations=jieba_op, input_columns=["text"])
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"])
    lookup = text.Lookup(vocab, "<unk>")
    data = data.map(operations=lookup, input_columns=["text"])
    res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14],
                    [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(d["text"], res[i])
def test_demo_basic_from_dataset_with_tokenizer():
    """ this is a tutorial on how from_dataset should be used in a normal use case with tokenizer"""
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt", shuffle=False)
    data = data.map(input_columns=["text"], operations=text.UnicodeCharTokenizer())
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(list(d["text"]))
    assert res == [[13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5],
                   [21, 20, 10, 25, 23, 26],
                   [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5],
                   [2, 2]]
def test_config(freq_range, top_k):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k,
                                    special_tokens=["<pad>", "<unk>"], special_first=True)
    corpus_dataset = corpus_dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns="text")
    res = []
    for d in corpus_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(list(d["text"]))
    return res
def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None,
                                    special_tokens=["<pad>", "<unk>"], special_first=True)
    data = data.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2], res
def test_on_tokenized_line_with_no_special_tokens():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(input_columns=["text"], operations=jieba_op)
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab, "not")
    data = data.map(input_columns=["text"], operations=lookup)
    res = np.array([[8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12],
                    [9, 0, 10, 0, 8, 0, 12, 0, 11, 0, 13]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator()):
        np.testing.assert_array_equal(d["text"], res[i])
def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train',
                               shuffle_dataset=False, max_seq_len=128, batch_size=64, drop_remainder=True):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence1"],
                          output_columns=["sentence1", "type_sentence1"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=["sentence1", "type_sentence1", "sentence2",
                                        "type_sentence2", "label_id"])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)], input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)], input_columns=["type_sentence2"])
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["type_sentence1", "type_sentence2"],
                          output_columns=["segment_ids"],
                          column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset