def test_custom_node_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        def __post_init__(self):
            super().__init__()
            self.node = lambda x: x

    pipe = (Corpus("custom")
            >> CustomPreprocessor()
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=32))
    data = pipe.run(data_path)

    assert len(data) == 1
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch_size (32) exceeds the 10 samples, so the single batch holds all of them
        assert len(batch) == 10
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 1


def test_custom_node_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        def __post_init__(self):
            super().__init__()
            self.node = lambda x: x

    pipe = (Corpus("custom")
            >> CustomPreprocessor()
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=5))
    data = pipe.run(data_path)

    assert len(data) == 2
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 5
        assert len(batch_x) == 5
        assert len(batch_y) == 5
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 2


def test_pipeline_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=2))
    data = pipe.run(data_path)

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size is 2
        assert len(batch) == 2
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 5


def test_normal_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager()

    data = datamanager(tokenizer(preprocessor(corpus(data_path))))

    assert len(data) == 10
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size default is 1
        assert len(batch) == 1
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 10


def test_bert_chinese_word_tokenizer_normal():
    vocab_file = "tests/dataprocessor/vocab.txt"
    import jieba

    tk = Tokenizer(name="bert_chinese_word",
                   vocab_file=vocab_file,
                   segmentor=jieba.lcut)
    text_list = ["我喜欢你,你也喜欢我。"]

    assert tk.node.vocab_size == 21128
    tk.node.build_vocab(text_list)
    assert tk.node.vocab_size == 21129

    assert tk.node.tokenize(text_list[0]) == [
        "我", "喜欢", "你", ",", "你", "也", "喜欢", "我", "。"]

    ids1 = tk(text_list)
    ids2 = tk(text_list[0])
    ids3 = tk.node.encode(text_list[0])
    assert ids1[0] == ids2
    assert ids1[0] == ids3


def test_functional_pipeline_without_label():
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)

    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == list
        # batch size is 2
        assert len(batch) == 2
        # this is batch data
        assert type(batch[0]) == list
        # tokens
        assert len(batch[0]) > 1
        i += 1
    assert i == 5


def test_pipeline_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=3, drop_last=True))
    data = pipe.run(data_path)

    assert len(data) == 3
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 3
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 3


def test_normal_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)

    data = datamanager(tokenizer(preprocessor(corpus(data_path))))

    assert len(data) == 5
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 2
        assert len(batch_x) == 2
        assert len(batch_y) == 2
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 5


def test_functional_pipeline_with_label():
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=3, drop_last=True)

    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)

    assert len(data) == 3
    i = 0
    for batch in data:
        assert type(batch) == tuple
        batch_x, batch_y = batch
        # batch size is 3
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # this is batch data
        assert type(batch_x[0]) == list
        # tokens
        assert len(batch_x[1]) > 1
        assert batch_y[0] in ["1", "0"]
        i += 1
    assert i == 3


def test_pretrained_processor_input_dataloader():
    pp = PretrainedBasicProcessor()
    batch_size = 10
    seq_len = 32
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    pipe = (Corpus("custom")
            >> Preprocessor("common")
            >> Tokenizer("bert", vocab_path)
            >> DataManager(batch_size=batch_size))
    data = pipe.run(data_path)

    outputs = pp(data)
    for output in outputs:
        print(output)
        assert len(output) == 4
        assert output["input_ids"].shape == torch.Size([batch_size, seq_len])
        assert output["attention_mask"].shape == torch.Size(
            [batch_size, seq_len])
        assert output["token_type_ids"].shape == torch.Size(
            [batch_size, seq_len])
        assert output["position_ids"].shape == torch.Size(
            [batch_size, seq_len])


def test_tokenizer_single_text():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    text = "我爱你。"
    assert tokenizer.node.tokenize(text) == ["我", "爱", "你", "。"]
    assert tokenizer(text) == [101, 2769, 4263, 872, 511, 102]


def test_tokenizer_single_element_with_label():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = [("我爱你。", "1")]
    assert tokenizer(texts) == [([101, 2769, 4263, 872, 511, 102], "1")]


def test_tokenizer_single_element():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = ["我爱你。"]
    assert tokenizer(texts) == [[101, 2769, 4263, 872, 511, 102]]


def test_tokenizer_multiple_texts_with_labels():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = [("我爱你。", 2), ("我爱你。", 2)]
    assert tokenizer(texts) == [([101, 2769, 4263, 872, 511, 102], 2),
                                ([101, 2769, 4263, 872, 511, 102], 2)]


def test_tokenizer_multiple_texts():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    texts = ["我爱你。", "我爱你。"]
    assert tokenizer(texts) == [[101, 2769, 4263, 872, 511, 102],
                                [101, 2769, 4263, 872, 511, 102]]


def test_tokenizer_single_text_with_label():
    vocab_file = "tests/dataprocessor/vocab.txt"
    tokenizer = Tokenizer("bert", vocab_file)

    text = ("我爱你。", 1)
    assert tokenizer.node.tokenize(text[0]) == ["我", "爱", "你", "。"]
    assert tokenizer(text) == ([101, 2769, 4263, 872, 511, 102], 1)