Example #1
    def test_space(self):
        # for example, character models treat space as a symbol
        dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index(" "), 4)
        self.assertEqual(d.index("a"), 5)
        self.assertEqual(d.index("b"), 6)
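The expected indices follow from fairseq's Dictionary reserving the first four slots for its special symbols, so the first symbol read from the file lands at index 4. A minimal sketch of that behavior, assuming the default special symbols of fairseq.data.Dictionary:

from fairseq.data import Dictionary

d = Dictionary()
# the constructor registers "<s>", "<pad>", "</s>" and "<unk>" at indices 0-3
print(d.bos(), d.pad(), d.eos(), d.unk())  # 0 1 2 3
# add_symbol returns the index of the entry, so the first user symbol is 4
print(d.add_symbol("a"))  # 4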
Example #2
    def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64),
                                                  (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents),
                                 max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        # assertAlmostEqual is expected to be a tensor-aware override defined on
        # the test class (stock unittest.assertAlmostEqual only handles numbers)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None
Example #3
    def _get_test_data(self):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([i + 1 for i in src_len])
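One way to sanity-check the tensor this helper builds is to map the indices back through the dictionary. A rough sketch, called from inside the same test class (Dictionary supports item lookup, returning the symbol string for an index):

vocab, x, src_len = self._get_test_data()  # x is (time, batch) after the transpose
batch_first = x.t()
for row, n in zip(batch_first, src_len):
    # vocab[idx] returns the symbol stored at that index
    print([vocab[idx.item()] for idx in row[:n]])
# expected:
# ['he@@', 'llo', 'n@@', 'ew', 'y@@', 'or@@', 'k', '</s>']
# ['how', 'are', 'y@@', 'ou', '</s>']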
Example #4
    def _get_test_data(self, append_eos=True):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor(src_len)
Example #5
    def setup_task(cls, args, **kwargs):
        # Here we can perform any setup required for the task. This may include
        # loading Dictionaries, initializing shared Embedding layers, etc.
        # In this case we'll just load the Dictionaries.
        reloaded = torch.load(args.xlmr_model_dict)
        params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        input_vocab = Dictionary(reloaded['dico_id2word'],
                                 reloaded['dico_word2id'],
                                 reloaded['dico_counts'])
        params.n_words = len(input_vocab)
        params.bos_index = input_vocab.index(BOS_WORD)
        params.eos_index = input_vocab.index(EOS_WORD)
        params.pad_index = input_vocab.index(PAD_WORD)
        params.unk_index = input_vocab.index(UNK_WORD)
        params.mask_index = input_vocab.index(MASK_WORD)

        label_vocab = Dictionary.load(os.path.join(args.data,
                                                   'dict.label.txt'))
        print('| [input] dictionary: {} types'.format(len(input_vocab)))
        print('| [label] dictionary: {} types'.format(len(label_vocab)))

        return SemparseSeq2SeqTask(args, input_vocab, label_vocab)
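For context, setup_task in fairseq is normally a classmethod on a task registered with @register_task; the snippet above omits the decorator. A stripped-down sketch of the same dictionary-loading pattern, assuming an args-based (pre-hydra) fairseq; the task name, dict filename, and constructor here are placeholders rather than the original project's code:

import os

from fairseq.data import Dictionary
from fairseq.tasks import FairseqTask, register_task


@register_task("semparse_seq2seq")  # hypothetical task name
class SketchSemparseTask(FairseqTask):
    @classmethod
    def setup_task(cls, args, **kwargs):
        # Dictionary.load reads a fairseq "dict.txt"-style vocabulary file
        label_vocab = Dictionary.load(os.path.join(args.data, "dict.label.txt"))
        print("| [label] dictionary: {} types".format(len(label_vocab)))
        return cls(args, label_vocab)

    def __init__(self, args, label_vocab):
        super().__init__(args)
        self.label_vocab = label_vocab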
Example #6
    def _convert_src_tokens_to_tensor(
        self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
    ):
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return x, torch.LongTensor(src_len)
Example #7
    def test_overwrite(self):
        # for example, Camembert overwrites <unk>, <s> and </s>
        dict_file = io.StringIO("<unk> 999 #fairseq:overwrite\n"
                                "<s> 999 #fairseq:overwrite\n"
                                "</s> 999 #fairseq:overwrite\n"
                                ", 999\n"
                                "▁de 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        # the overwritten specials are appended after the default ones, so they
        # land at indices 4-6; "<pad>" keeps its slot and unknown words such as
        # "foo" still resolve to the original <unk> index 3
        self.assertEqual(d.index("<pad>"), 1)
        self.assertEqual(d.index("foo"), 3)
        self.assertEqual(d.index("<unk>"), 4)
        self.assertEqual(d.index("<s>"), 5)
        self.assertEqual(d.index("</s>"), 6)
        self.assertEqual(d.index(","), 7)
        self.assertEqual(d.index("▁de"), 8)
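The #fairseq:overwrite flag is what makes the duplicated special tokens legal: without it, add_from_file refuses to re-add a symbol that is already in the dictionary. A small sketch of that failure mode (the exact exception type may vary across fairseq versions):

import io

from fairseq.data import Dictionary

d = Dictionary()
try:
    # "<unk>" is already registered by the constructor and the line carries
    # no "#fairseq:overwrite" flag, so loading this file should fail
    d.add_from_file(io.StringIO("<unk> 999\n"))
except Exception as exc:
    print(type(exc).__name__, exc)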
Example #8
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")
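finalize() re-sorts the non-special symbols by frequency and, with padding_factor=8, pads the vocabulary to a multiple of 8 with filler entries, which is why the test reads counts back through get_count rather than relying on insertion order. A small sketch of that behavior, assuming fairseq's default special symbols:

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("rare", n=1)
d.add_symbol("common", n=100)
d.finalize(threshold=0, nwords=-1, padding_factor=8)
# non-special symbols are now ordered by count, and the vocabulary is
# padded with filler tokens until its size is a multiple of 8
assert d.index("common") < d.index("rare")
assert len(d) % 8 == 0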
Example #9
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
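A quick illustration of the contract: the language symbol must already be in the dictionary, otherwise index() falls back to the <unk> index and the assertion fires (the language names here are illustrative):

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("en")
d.add_symbol("de")
assert _lang_id(d, "de") == 5  # indices 0-3 are the special symbols, "en" is 4
# a language that was never added maps to <unk> and trips the assertion:
# _lang_id(d, "fr")  -> AssertionError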
Example #10
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, \
        'cannot find language token for lang {}'.format(lang)
    return idx
Example #11
    def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
        lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
        assert lang_tag_idx != dictionary.unk()
        return lang_tag_idx
Example #12
def _lang_token_index(dic: Dictionary, lang: str, style="__{}__"):
    """Return language token index."""
    idx = dic.index(_lang_token(lang, style))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx
Example #13
class XLMRobertaTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for our custom pretrained model.

    You can ignore this file if you use another pretrained model; for example,
    with PhoBERT you should tokenize with VnCoreNLP instead.
    """
    def __init__(self,
                 pretrained_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        # note: use sp_model only to split text into pieces; do not take token
        # ids from it, since its piece ids do not line up with the fairseq
        # dictionary loaded below
        self.sp_model.Load(sentencepiece_model)

        self.bpe_dict = Dictionary().load(vocab_file)

        # Mimic the fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {
            "<s>": 0,
            "<pad>": 1,
            "</s>": 2,
            "<unk>": 3
        }

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0

        self.fairseq_tokens_to_ids["<mask>"] = len(
            self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        return len(
            self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab
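Illustrative usage only: the directory path below is a placeholder and must contain the sentencepiece.bpe.model and dict.txt files that __init__ expects.

tokenizer = XLMRobertaTokenizer("path/to/pretrained")  # placeholder directory
pieces = tokenizer.tokenize("xin chào")        # SentencePiece pieces
ids = tokenizer.convert_tokens_to_ids(pieces)  # ids come from the fairseq dict
print(pieces, ids)
print(tokenizer.vocab_size, len(tokenizer.get_vocab()))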