def test_space(self):
    # for example, character models treat space as a symbol
    # note: the first entry is a space symbol followed by the separator, i.e. two spaces
    dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
    d = Dictionary()
    d.add_from_file(dict_file)
    self.assertEqual(d.index(" "), 4)
    self.assertEqual(d.index("a"), 5)
    self.assertEqual(d.index("b"), 6)
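A minimal standalone sketch (not part of the test above) of why the first symbol read from the file lands at index 4: a freshly constructed fairseq Dictionary already reserves the special symbols at indices 0 through 3. The import path is the usual fairseq one and is assumed here.

# Hedged sketch: assumes fairseq is installed and exposes Dictionary under fairseq.data.
from fairseq.data import Dictionary

d = Dictionary()
print(d.bos(), d.pad(), d.eos(), d.unk())  # 0 1 2 3, the reserved special symbols
print(d.add_symbol(" "))                   # 4, space is added like any other symbol
print(d.add_symbol("a"))                   # 5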
def test_character_token_embedder(self):
    vocab = Dictionary()
    vocab.add_symbol('hello')
    vocab.add_symbol('there')

    embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2)

    test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
    max_len = max(len(s) for s in test_sents)
    input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad())
    for i in range(len(test_sents)):
        input[i][0] = vocab.eos()
        for j in range(len(test_sents[i])):
            input[i][j + 1] = vocab.index(test_sents[i][j])
        input[i][j + 2] = vocab.eos()
    embs = embedder(input)

    assert embs.size() == (len(test_sents), max_len + 2, 5)
    self.assertAlmostEqual(embs[0][0], embs[1][0])
    self.assertAlmostEqual(embs[0][0], embs[0][-1])
    self.assertAlmostEqual(embs[0][1], embs[2][1])
    self.assertAlmostEqual(embs[0][3], embs[1][1])

    embs.sum().backward()
    assert embedder.char_embeddings.weight.grad is not None
def _get_test_data(self):
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    src_len = [len(x) for x in src_tokens]
    x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        x[i][j + 1] = vocab.eos()
    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor([i + 1 for i in src_len])
def _get_test_data(self, append_eos=True):
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    src_len = [len(x) for x in src_tokens]
    # If we have to append EOS, we include EOS in counting src length
    if append_eos:
        src_len = [length + 1 for length in src_len]

    x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        if append_eos:
            x[i][j + 1] = vocab.eos()
    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor(src_len)
@classmethod
def setup_task(cls, args, **kwargs):
    # Here we can perform any setup required for the task. This may include
    # loading Dictionaries, initializing shared Embedding layers, etc.
    # In this case we'll just load the Dictionaries.
    reloaded = torch.load(args.xlmr_model_dict)
    params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    input_vocab = Dictionary(
        reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts']
    )
    params.n_words = len(input_vocab)
    params.bos_index = input_vocab.index(BOS_WORD)
    params.eos_index = input_vocab.index(EOS_WORD)
    params.pad_index = input_vocab.index(PAD_WORD)
    params.unk_index = input_vocab.index(UNK_WORD)
    params.mask_index = input_vocab.index(MASK_WORD)

    label_vocab = Dictionary.load(os.path.join(args.data, 'dict.label.txt'))

    print('| [input] dictionary: {} types'.format(len(input_vocab)))
    print('| [label] dictionary: {} types'.format(len(label_vocab)))

    return SemparseSeq2SeqTask(args, input_vocab, label_vocab)
def _convert_src_tokens_to_tensor(
    self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
):
    src_len = [len(x) for x in src_tokens]
    # If we have to append EOS, we include EOS in counting src length
    if append_eos:
        src_len = [length + 1 for length in src_len]

    x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        if append_eos:
            x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return x, torch.LongTensor(src_len)
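A hedged sketch, separate from the test class above: fairseq's Dictionary can produce the per-sentence id tensor directly via encode_line, so the helper above only adds batching, padding, and the final transpose to a time-major (seq_len x batch) layout. It assumes fairseq is installed and that encode_line appends EOS by default.

# Hedged sketch; encode_line is assumed to append vocab.eos() by default.
import torch
from fairseq.data import Dictionary

vocab = Dictionary()
for sym in ["how", "are", "y@@", "ou"]:
    vocab.add_symbol(sym)

ids = vocab.encode_line("how are y@@ ou", add_if_not_exist=False)
print(ids)          # indices of the four symbols followed by the EOS id
print(vocab.eos())  # 2 with the default special symbols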
def test_overwrite(self):
    # for example, Camembert overwrites <unk>, <s> and </s>
    dict_file = io.StringIO(
        "<unk> 999 #fairseq:overwrite\n"
        "<s> 999 #fairseq:overwrite\n"
        "</s> 999 #fairseq:overwrite\n"
        ", 999\n"
        "▁de 999\n"
    )
    d = Dictionary()
    d.add_from_file(dict_file)
    self.assertEqual(d.index("<pad>"), 1)
    self.assertEqual(d.index("foo"), 3)
    self.assertEqual(d.index("<unk>"), 4)
    self.assertEqual(d.index("<s>"), 5)
    self.assertEqual(d.index("</s>"), 6)
    self.assertEqual(d.index(","), 7)
    self.assertEqual(d.index("▁de"), 8)
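A hedged sketch of why the #fairseq:overwrite marker exists in the file above: add_from_file is assumed to reject a dictionary entry that re-declares a symbol already present (such as the reserved <unk>) unless the marker is given.

# Hedged sketch; the exact exception type and message are fairseq internals and
# are assumed here, so a broad except is used.
import io
from fairseq.data import Dictionary

d = Dictionary()
try:
    d.add_from_file(io.StringIO("<unk> 999\n"))  # no #fairseq:overwrite marker
except Exception as err:
    print(err)  # expected: a duplicate-word error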
def test_add_file_to_dict(self):
    counts = {}
    num_lines = 100
    per_line = 10
    with tempfile.TemporaryDirectory("test_sampling") as data_dir:
        filename = os.path.join(data_dir, "dummy.txt")
        with open(filename, "w", encoding="utf-8") as data:
            for c in string.ascii_letters:
                line = f"{c} " * per_line
                for _ in range(num_lines):
                    data.write(f"{line}\n")
                counts[c] = per_line * num_lines
                per_line += 5

        dict = Dictionary()
        Dictionary.add_file_to_dictionary(
            filename, dict, tokenizer.tokenize_line, 10
        )
        dict.finalize(threshold=0, nwords=-1, padding_factor=8)

        for c in string.ascii_letters:
            count = dict.get_count(dict.index(c))
            self.assertEqual(
                counts[c], count, f"{c} count is {count} but should be {counts[c]}"
            )
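A hedged sketch of the finalize call used above: padding_factor is assumed to pad the vocabulary size up to a multiple of that value with filler symbols, which keeps embedding matrices GPU-friendly.

# Hedged sketch; the padding behaviour of finalize is assumed, not asserted by the test above.
from fairseq.data import Dictionary

d = Dictionary()        # starts with the 4 reserved special symbols
d.add_symbol("hello")   # 5 symbols total
d.finalize(threshold=0, nwords=-1, padding_factor=8)
print(len(d))           # expected to be padded up to 8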
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, \
        'cannot find language token for lang {}'.format(lang)
    return idx
@classmethod
def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
    lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
    assert lang_tag_idx != dictionary.unk()
    return lang_tag_idx
def _lang_token_index(dic: Dictionary, lang: str, style="__{}__"):
    """Return language token index."""
    idx = dic.index(_lang_token(lang, style))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx
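A hedged usage sketch of the helper above. The _lang_token function shown here is a hypothetical stand-in that simply formats the language code with the given style (e.g. "__de__"); the real helper in the source codebase may differ. The assertion in _lang_token_index only passes if the token was added to the dictionary beforehand, because index() falls back to unk_index for unknown symbols.

# Hedged sketch; _lang_token below is a hypothetical placeholder for illustration.
from fairseq.data import Dictionary


def _lang_token(lang: str, style="__{}__") -> str:
    return style.format(lang)


dic = Dictionary()
dic.add_symbol(_lang_token("de"))    # register the language token first
print(_lang_token_index(dic, "de"))  # its index, guaranteed != dic.unk_index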
class XLMRobertaTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for our custom pretrained model.

    You can ignore this file if you use another pretrained model.
    For example, if you use PhoBert, you should tokenize with VnCoreNLP instead.
    """

    def __init__(self,
                 pretrained_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sentencepiece_model)
        # do not take ids from sp_model directly; its ids are not aligned with the
        # fairseq vocab, so all lookups must go through bpe_dict instead
        self.bpe_dict = Dictionary().load(vocab_file)

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token "," has position 4 in the original fairseq vocab
        # and position 3 in the spm vocab
        self.fairseq_offset = 0

        self.fairseq_tokens_to_ids["<mask>"] = len(self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {
            v: k for k, v in self.fairseq_tokens_to_ids.items()
        }

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        return len(self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        vocab = {
            self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab
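A hedged usage sketch for the tokenizer class above. The checkpoint directory name is hypothetical; it must contain the sentencepiece.bpe.model and dict.txt files the constructor expects. Only standard PreTrainedTokenizer methods (tokenize, convert_tokens_to_ids) are used.

# Hedged sketch; "/path/to/xlmr_checkpoint" is a placeholder, not a real path.
tokenizer = XLMRobertaTokenizer(pretrained_file="/path/to/xlmr_checkpoint")
pieces = tokenizer.tokenize("Hello world")     # sentencepiece sub-tokens
ids = tokenizer.convert_tokens_to_ids(pieces)  # ids resolved through the fairseq dict
print(pieces, ids)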