def test_normalize(self):
    """Check that a Lowercase normalizer lowercases a mixed-case sentence."""
    extra_tokens = ["my", "name", "is", "john", "pair"]

    tok = Tokenizer(BPE())
    tok.add_tokens(extra_tokens)
    tok.normalizer = Lowercase()

    # Only the normalization step is exercised; no encoding takes place.
    assert tok.normalize("My Name Is John") == "my name is john"
def test_strip_accents(self):
    """Check that BertNormalizer with only ``strip_accents`` on removes accents."""
    tok = Tokenizer(BPE.empty())
    # Every other BertNormalizer option is switched off so that the
    # assertion isolates accent stripping.
    bert_options = dict(
        strip_accents=True,
        lowercase=False,
        handle_chinese_chars=False,
        clean_text=False,
    )
    tok.normalizer = BertNormalizer(**bert_options)

    assert tok.normalize("Héllò") == "Hello"
def test_clean_text(self):
    """Check that BertNormalizer with only ``clean_text`` on drops a leading U+FEFF."""
    tok = Tokenizer(BPE())
    # Every other BertNormalizer option is switched off so that the
    # assertion isolates text cleaning.
    bert_options = dict(
        strip_accents=False,
        lowercase=False,
        handle_chinese_chars=False,
        clean_text=True,
    )
    tok.normalizer = BertNormalizer(**bert_options)

    assert tok.normalize("\ufeffHello") == "Hello"
"""Separate the Chinese and English parts of a sentence with huggingface/tokenizers.

Based on:
https://github.com/huggingface/tokenizers/blob/master/bindings/python/tests/bindings/test_normalizers.py
"""
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer

# Sample sentence mixing Chinese characters with an English name.
text = "薛定谔的猫(英文名称:Erwin Schrödinger's Cat)是奥地利著名物理学家薛定谔"

tokenizer = Tokenizer(BPE())
# Only Chinese-character handling is enabled; accent stripping, lowercasing
# and text cleaning are all switched off so the rest of the text is untouched.
tokenizer.normalizer = BertNormalizer(
    strip_accents=False,
    lowercase=False,
    handle_chinese_chars=True,
    clean_text=False,
)

# Fixed: the original passed the undefined name `txt`, which raised NameError.
# NOTE(review): `Tokenizer.normalize` appears to be absent from recent
# tokenizers releases -- confirm against the installed version.
output = tokenizer.normalize(text)
print(output)
def test_full_strip(self):
    """Check that Strip with both sides enabled trims the padded input to "hello"."""
    tok = Tokenizer(BPE.empty())
    tok.normalizer = Strip(left=True, right=True)

    padded = " hello "
    assert tok.normalize(padded) == "hello"
def test_lowercase(self):
    """Check that the Lowercase normalizer turns "HELLO" into "hello"."""
    tok = Tokenizer(BPE.empty())
    tok.normalizer = Lowercase()

    shouted = "HELLO"
    assert tok.normalize(shouted) == "hello"
def test_can_make_sequences(self):
    """Check that a Sequence of Lowercase and Strip applies both normalizers."""
    steps = [Lowercase(), Strip()]

    tok = Tokenizer(BPE.empty())
    tok.normalizer = Sequence(steps)

    # The input is both upper-case and padded, so each step has work to do.
    assert tok.normalize(" HELLO ") == "hello"
def test_right_strip(self):
    """Check that a right-only Strip trims the trailing space and keeps the leading one."""
    tok = Tokenizer(BPE())
    tok.normalizer = Strip(left=False, right=True)

    padded = " hello "
    # The leading space must survive because left=False.
    assert tok.normalize(padded) == " hello"