# Imports assumed by these snippets (MindSpore 1.x dataset API); process_msra
# is a generator function expected to yield (text, label) pairs and its
# definition is not part of this excerpt.
import numpy as np

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.transforms.c_transforms as ops


def process_ner_msra_dataset(data_dir,
                             label_list,
                             bert_vocab_path,
                             max_seq_len=128,
                             class_filter=None,
                             split_begin=None,
                             split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA from CLUEDataset
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter,
                                               split_begin, split_end),
                                  column_names=['text', 'label'])

    ### Process the labels
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup,
                          input_columns="label",
                          output_columns="label_ids")
    dataset = dataset.map(
        operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
        input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["label_ids"])
    ### Process the sentences
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer,
                          input_columns=["text"],
                          output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup,
                          input_columns=["sentence"],
                          output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["input_ids"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "input_mask"],
        column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["input_mask"])
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["input_ids"],
        output_columns=["input_ids", "segment_ids"],
        column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0),
                          input_columns=["segment_ids"])
    return dataset
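A hedged usage sketch for the function above; the data directory, vocabulary path, batch size and BIO label list are placeholders for illustration, not the project's actual configuration:

# Placeholder arguments for illustration only.
msra_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
ner_dataset = process_ner_msra_dataset(data_dir="/path/to/msra",
                                       label_list=msra_labels,
                                       bert_vocab_path="/path/to/vocab.txt",
                                       max_seq_len=128)
ner_dataset = ner_dataset.batch(32, drop_remainder=True)
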
def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
        [0, 3, 6, 9, 12, 15],
        [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16],
        [0, 1],
    ]
    expected_offsets_limit = [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [3, 6, 9, 12, 15, 18],
        [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17],
        [1, 2],
    ]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens
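Both tokenizer tests compare against a split_by_unicode_char helper that is not part of this excerpt; a minimal sketch consistent with how it is used (split every input string into a list of its Unicode characters) could look like this:

def split_by_unicode_char(input_strs):
    # Sketch only; the helper in the original test suite may differ in detail.
    return [list(s) for s in input_strs]
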
Example #3
def test_unicode_char_tokenizer():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The output tokens are: {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens
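This example refers to the text package through the alias nlp; presumably it was imported along these lines (an assumption, since the import is not part of the excerpt):

import mindspore.dataset.text as nlp  # assumed alias used in this example
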
Example #4
def test_demo_basic_from_dataset_with_tokenizer():
    """ this is a tutorial on how from_dataset should be used in a normal use case with tokenizer"""
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt",
                              shuffle=False)
    data = data.map(input_columns=["text"],
                    operations=text.UnicodeCharTokenizer())
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(list(d["text"]))
    assert res == [
        [13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5],
        [21, 20, 10, 25, 23, 26],
        [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5],
        [2, 2],
    ]
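As a side note on the same API, Vocab.from_dataset also accepts special_tokens and special_first arguments to reserve ids at the front of the vocabulary. A hedged sketch of that variant, which would replace the from_dataset call in the tutorial above (the asserted ids would then shift, because the special tokens take the first slots):

# Sketch only: "<pad>" and "<unk>" would receive ids 0 and 1.
vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=None,
                                top_k=None, special_tokens=["<pad>", "<unk>"],
                                special_first=True)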