def test_full_tokenizer(self):
    tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

    tokens = tokenizer.tokenize("This is a test")
    self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(
        tokens,
        ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."],
    )
    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

    # "é" is not in the sample vocabulary, so its id (1) decodes back to "<unk>".
    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(
        back_tokens,
        ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
    )
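A minimal, self-contained sketch of the same tokenize -> ids -> tokens round trip the test above exercises, run against a public checkpoint instead of the local SAMPLE_VOCAB fixture (the checkpoint name is an assumption, and the exact pieces and ids depend on its vocabulary):

# Sketch: round-trip a string through AlbertTokenizer using the pretrained
# "albert-base-v2" checkpoint (assumed here; any ALBERT checkpoint works).
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", keep_accents=True)

tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
ids = tokenizer.convert_tokens_to_ids(tokens)
back_tokens = tokenizer.convert_ids_to_tokens(ids)

# Any piece missing from the vocabulary maps to the unk id and comes back as
# the unk token, which is exactly what the fixture test above asserts.
print(tokens)
print(ids)
print(back_tokens)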
def run(self):
    """CORPUS/xxx.txt, DATA/xxx.vocab, DATA/xxx.model -> DATA/xxx.txt"""
    vocab_file = os.path.join(ACE_ROOT, '%s.vocab' % self.config.model_prefix)
    model_file = os.path.join(ACE_ROOT, '%s.model' % self.config.model_prefix)
    self.__create_text()
    # Build the vocab/model pair only if it is not already on disk.
    if not os.path.isfile(vocab_file) or not os.path.isfile(model_file):
        self.__create_vocab()
    tokenizer = AlbertTokenizer(vocab_file=vocab_file,
                                model_file=model_file,
                                do_lower_case=self.config.do_lower_case,
                                remove_space=self.config.remove_space,
                                keep_accents=self.config.keep_accents,
                                bos_token=self.config.bos_token,
                                eos_token=self.config.eos_token,
                                unk_token=self.config.unk_token,
                                sep_token=self.config.sep_token,
                                pad_token=self.config.pad_token,
                                cls_token=self.config.cls_token,
                                mask_token=self.config.mask_token)
    # Tokenize every corpus .txt file, writing one space-joined token line per
    # input line to a same-named file under data_dir.
    for text_file in tqdm(FileUtil.file_list(self.config.corpus_dir),
                          desc='create pretraining data files'):
        if text_file.endswith('.txt'):
            data_file = os.path.join(self.config.data_dir, os.path.basename(text_file))
            with open(text_file, 'r') as f, open(data_file, 'w') as fw:
                for line in f.read().splitlines():
                    tokens = tokenizer.tokenize(line)
                    fw.write(' '.join(tokens) + '\n')
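self.__create_vocab() is project-specific, but since the .vocab/.model pair it must produce is SentencePiece's standard output, a plausible stand-in is plain SentencePiece training; the paths, vocab_size, and model_type below are assumptions, not the project's actual values:

# Hypothetical sketch of what self.__create_vocab() might wrap: training a
# unigram SentencePiece model that writes DATA/xxx.model and DATA/xxx.vocab.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='CORPUS/xxx.txt',    # raw text emitted by __create_text() (assumed path)
    model_prefix='DATA/xxx',   # output prefix -> DATA/xxx.model, DATA/xxx.vocab
    vocab_size=30000,          # assumed; ALBERT's released models use a 30k vocab
    model_type='unigram',      # SentencePiece default, and what ALBERT uses
)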