def test_custom_node_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        def __post_init__(self):
            super().__init__()
            self.node = lambda x: x

    pipe = (Corpus("custom")
            >> CustomPreprocessor()
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=32))
    data = pipe.run(data_path)

    assert len(data) == 1
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch_size (32) exceeds the 10 samples, so the single batch holds all of them
        assert len(batch) == 10
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 1


def test_custom_node_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        def __post_init__(self):
            super().__init__()
            self.node = lambda x: x

    pipe = (Corpus("custom")
            >> CustomPreprocessor()
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=5))
    data = pipe.run(data_path)

    assert len(data) == 2
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 5
        assert len(batch_x) == 5
        assert len(batch_y) == 5
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 2


def test_pipeline_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=2))
    data = pipe.run(data_path)

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size is 2
        assert len(batch) == 2
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 5


def test_normal_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager()

    data = datamanager(tokenizer(preprocessor(corpus(data_path))))

    assert len(data) == 10
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size default is 1
        assert len(batch) == 1
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 10


def test_bert_chinese_word_tokenizer_normal():
    vocab_file = "tests/dataprocessor/vocab.txt"
    import jieba

    tk = Tokenizer(name="bert_chinese_word",
                   vocab_file=vocab_file,
                   segmentor=jieba.lcut)
    text_list = ["我喜欢你,你也喜欢我。"]

    assert tk.node.vocab_size == 21128
    tk.node.build_vocab(text_list)
    assert tk.node.vocab_size == 21129

    assert tk.node.tokenize(text_list[0]) == [
        "我", "喜欢", "你", ",", "你", "也", "喜欢", "我", "。"]

    ids1 = tk(text_list)
    ids2 = tk(text_list[0])
    ids3 = tk.node.encode(text_list[0])
    assert ids1[0] == ids2
    assert ids1[0] == ids3


def test_functional_pipeline_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)

    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size is 2
        assert len(batch) == 2
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 5


def test_pipeline_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=3, drop_last=True))
    data = pipe.run(data_path)

    assert len(data) == 3
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 3
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 3


def test_normal_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)

    data = datamanager(tokenizer(preprocessor(corpus(data_path))))

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 2
        assert len(batch_x) == 2
        assert len(batch_y) == 2
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 5


def test_functional_pipeline_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=3, drop_last=True)

    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)

    assert len(data) == 3
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 3
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 3


def test_pretrained_processor_input_dataloader():
    pp = PretrainedBasicProcessor()
    batch_size = 10
    seq_len = 32
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=batch_size))
    data = pipe.run(data_path)

    outputs = pp(data)
    for output in outputs:
        print(output)
        assert len(output) == 4
        assert output["input_ids"].shape == torch.Size([batch_size, seq_len])
        assert output["attention_mask"].shape == torch.Size(
            [batch_size, seq_len])
        assert output["token_type_ids"].shape == torch.Size(
            [batch_size, seq_len])
        assert output["position_ids"].shape == torch.Size(
            [batch_size, seq_len])


def test_tokenizer_single_text():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    text = "我爱你。"
    assert tokenizer.node.tokenize(text) == ["我", "爱", "你", "。"]
    assert tokenizer(text) == [101, 2769, 4263, 872, 511, 102]


def test_tokenizer_single_element_with_label():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = [("我爱你。", "1")]
    assert tokenizer(texts) == [([101, 2769, 4263, 872, 511, 102], "1")]


def test_tokenizer_single_element():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = ["我爱你。"]
    assert tokenizer(texts) == [[101, 2769, 4263, 872, 511, 102]]


def test_tokenizer_multiple_texts_with_labels():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = [("我爱你。", 2), ("我爱你。", 2)]
    assert tokenizer(texts) == [([101, 2769, 4263, 872, 511, 102], 2),
                                ([101, 2769, 4263, 872, 511, 102], 2)]


def test_tokenizer_multiple_texts():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = ["我爱你。", "我爱你。"]
    assert tokenizer(texts) == [[101, 2769, 4263, 872, 511, 102],
                                [101, 2769, 4263, 872, 511, 102]]


def test_tokenizer_single_text_with_label():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    text = ("我爱你。", 1)
    assert tokenizer.node.tokenize(text[0]) == ["我", "爱", "你", "。"]
    assert tokenizer(text) == ([101, 2769, 4263, 872, 511, 102], 1)