def process_ner_msra_dataset(data_dir,
                             label_list,
                             bert_vocab_path,
                             max_seq_len=128,
                             class_filter=None,
                             split_begin=None,
                             split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA data via GeneratorDataset
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter,
                                               split_begin, split_end),
                                  column_names=['text', 'label'])

    ### Processing label
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup,
                          input_columns="label",
                          output_columns="label_ids")
    dataset = dataset.map(
        operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
        input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["label_ids"])
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer,
                          input_columns=["text"],
                          output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup,
                          input_columns=["sentence"],
                          output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["input_ids"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "input_mask"],
        column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["input_mask"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "segment_ids"],
        column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0),
                          input_columns=["segment_ids"])
    return dataset
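As a rough usage sketch (not from the original source), the routine above might be driven as follows; the paths and the BIO label list are placeholders, and `process_msra` is assumed to be defined elsewhere in the same module.
# Hypothetical invocation; directory, vocab path and label list are placeholders.
msra_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
ner_dataset = process_ner_msra_dataset(data_dir="/path/to/msra",
                                       label_list=msra_labels,
                                       bert_vocab_path="/path/to/bert_vocab.txt",
                                       max_seq_len=128)
for row in ner_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
    # Each row holds input_ids, input_mask, segment_ids and label_ids, all padded to max_seq_len.
    print(row["input_ids"].shape, row["label_ids"].shape)
    break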
Example #2
def test_from_list_lookup_empty_string():
    # "" is a valid word in vocab, which can be looked up by LookupOp
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "),
                                 ["<pad>", ""], True)
    lookup = text.Lookup(vocab, "")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1

    # When the unknown_token of Lookup is None, it is converted to std::nullopt in C++,
    # so it is unrelated to "" in the vocab and the C++ side skips the unknown_token lookup
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "),
                                 ["<pad>", ""], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    try:
        for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            pass
    except RuntimeError as e:
        assert "token: \"is\" doesn't exist in vocab and no unknown token is specified" in str(
            e)
Example #3
def test_config(lookup_str, data_type=None):
    try:
        vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        # if data_type is None, test the default value of data_type
        op = text.Lookup(vocab, "<unk>") if data_type is None else text.Lookup(vocab, "<unk>", data_type)
        data = data.map(operations=op, input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"])
        return res[0].dtype
    except (ValueError, RuntimeError, TypeError) as e:
        return str(e)
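As a hedged illustration (not part of the original test), the helper above could be exercised like this, assuming `gen`, `np` and `mstype` are available in the surrounding module:
# Hypothetical checks: Lookup defaults to int32 output, and an explicit data_type gives the same result here.
assert test_config("w1") == np.dtype("int32")
assert test_config("w1", mstype.int32) == np.dtype("int32")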
Example #4
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
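A minimal, hypothetical call to the routine above is sketched below; the paths are placeholders and the label list is truncated to a few indicative TNEWS label descriptions.
# Hypothetical invocation; real usage needs the full TNEWS label_desc list.
tnews_labels = ["news_story", "news_culture", "news_entertainment"]
label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
    data_dir="/path/to/tnews",
    label_list=tnews_labels,
    bert_vocab_path="/path/to/bert_vocab.txt",
    data_usage='eval',
    max_seq_len=128,
    batch_size=64)
# Each returned list holds one batch per step; e.g. text_ids[0] has shape (batch_size, max_seq_len).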
Example #5
def test_lookup_callable():
    """
    Test lookup is callable
    """
    logger.info("test_lookup_callable")
    vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    lookup = text.Lookup(vocab)
    word = "迎"
    assert lookup(word) == 3
Example #6
def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, 6)  # out-of-vocabulary tokens are mapped to id 6
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
Example #7
def test_config(texts, top_k, special_tokens, special_first):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset, None, None, top_k,
                                    special_tokens, special_first)
    data = ds.GeneratorDataset(gen_input(texts), column_names=["text"])
    data = data.map(input_columns="text", operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    return res
Example #8
def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt",
                              shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2]
Example #9
def test_from_list():
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "))
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
Example #10
def test_from_file():
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)
    ind = 0
    res = [10, 11, 12, 15, 13, 14]
    for d in data.create_dict_iterator():
        assert d["text"] == res[ind], ind
        ind += 1
Example #11
def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, "<unk>")  # any unknown token will be mapped to the id of <unk>
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
Example #12
def test_config(freq_range, top_k):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range,
                                    top_k)
    corpus_dataset = corpus_dataset.map(input_columns="text",
                                        operations=text.Lookup(vocab))
    res = []
    for d in corpus_dataset.create_dict_iterator():
        res.append(list(d["text"]))
    return res
Example #13
def test_from_file_tutorial():
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", None, ["<pad>", "<unk>"], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [10, 11, 12, 15, 13, 14]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
Example #14
def test_from_list_tutorial():
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", "<unk>"], True)
    lookup = text.Lookup(vocab, "<unk>")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
Example #15
def test_config(lookup_str, vocab_input, special_tokens, special_first):
    try:
        vocab = text.Vocab.from_list(vocab_input, special_tokens, special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
        res = []
        for d in data.create_dict_iterator():
            res.append(d["text"].item())
        return res
    except ValueError as e:
        return str(e)
Example #16
def test_config(lookup_str, vocab_input, special_tokens, special_first, unknown_token):
    try:
        vocab = text.Vocab.from_list(vocab_input, special_tokens, special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, unknown_token), input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res
    except (ValueError, RuntimeError, TypeError) as e:
        return str(e)
Example #17
def test_config(lookup_str, vocab_size, special_tokens, special_first):
    try:
        vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size, special_tokens=special_tokens,
                                     special_first=special_first)
        data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
        data = data.map(operations=text.Lookup(vocab, "s2"), input_columns=["text"])
        res = []
        for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(d["text"].item())
        return res
    except ValueError as e:
        return str(e)
Example #18
def test_on_tokenized_line():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt", shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)
    data = data.map(operations=jieba_op, input_columns=["text"])
    vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"])
    lookup = text.Lookup(vocab, "<unk>")
    data = data.map(operations=lookup, input_columns=["text"])
    res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14],
                    [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(d["text"], res[i])
Example #19
def test_demo_basic_from_dataset_with_tokenizer():
    """ this is a tutorial on how from_dataset should be used in a normal use case with tokenizer"""
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt",
                              shuffle=False)
    data = data.map(input_columns=["text"],
                    operations=text.UnicodeCharTokenizer())
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(list(d["text"]))
    assert res == [[13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5],
                   [21, 20, 10, 25, 23, 26],
                   [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5],
                   [2, 2]]
Example #20
def test_config(freq_range, top_k):
    corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
    vocab = text.Vocab.from_dataset(corpus_dataset,
                                    None,
                                    freq_range,
                                    top_k,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    corpus_dataset = corpus_dataset.map(operations=text.Lookup(vocab, "<unk>"),
                                        input_columns="text")
    res = []
    for d in corpus_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(list(d["text"]))
    return res
Example #21
def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt",
                              shuffle=False)
    vocab = text.Vocab.from_dataset(data,
                                    "text",
                                    freq_range=None,
                                    top_k=None,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)
    data = data.map(operations=text.Lookup(vocab, "<unk>"),
                    input_columns=["text"])
    res = []
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2], res
Example #22
def test_on_tokenized_line_with_no_special_tokens():
    data = ds.TextFileDataset("../data/dataset/testVocab/lines.txt",
                              shuffle=False)
    jieba_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=text.JiebaMode.MP)
    with open(VOCAB_FILE, 'r') as f:
        for line in f:
            word = line.split(',')[0]
            jieba_op.add_word(word)

    data = data.map(input_columns=["text"], operations=jieba_op)
    vocab = text.Vocab.from_file(VOCAB_FILE, ",")
    lookup = text.Lookup(vocab, "not")
    data = data.map(input_columns=["text"], operations=lookup)
    res = np.array([[8, 0, 9, 0, 10, 0, 13, 0, 11, 0, 12],
                    [9, 0, 10, 0, 8, 0, 12, 0, 11, 0, 13]],
                   dtype=np.int32)
    for i, d in enumerate(data.create_dict_iterator()):
        np.testing.assert_array_equal(d["text"], res[i])
Example #23
def process_cmnli_clue_dataset(data_dir,
                               label_list,
                               bert_vocab_path,
                               data_usage='train',
                               shuffle_dataset=False,
                               max_seq_len=128,
                               batch_size=64,
                               drop_remainder=True):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(
            operations=ops.Duplicate(),
            input_columns=["id"],
            output_columns=["id", "label_id"],
            column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0),
                              input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup,
                              input_columns="label",
                              output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(
        operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
        input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["sentence1"],
        output_columns=["sentence1", "type_sentence1"],
        column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(),
                          input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=[
                              "sentence1", "type_sentence1", "sentence2",
                              "type_sentence2", "label_id"
                          ])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)],
                          input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)],
                          input_columns=["type_sentence2"])
    dataset = dataset.map(
        operations=ops.Concatenate(),
        input_columns=["type_sentence1", "type_sentence2"],
        output_columns=["segment_ids"],
        column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["text_ids"],
        output_columns=["text_ids", "mask_ids"],
        column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset
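For reference, a hypothetical invocation of the routine above; the paths are placeholders and the three-way NLI label list is only an example whose order must match the training convention.
# Hypothetical invocation; paths and label order are placeholders.
cmnli_dataset = process_cmnli_clue_dataset(data_dir="/path/to/cmnli",
                                           label_list=["contradiction", "entailment", "neutral"],
                                           bert_vocab_path="/path/to/bert_vocab.txt",
                                           data_usage='train',
                                           max_seq_len=128,
                                           batch_size=64)
for batch in cmnli_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
    # Batched columns: text_ids, mask_ids, segment_ids and label_id.
    print(batch["text_ids"].shape, batch["segment_ids"].shape)
    break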