Example #1
    def test_space(self):
        # for example, character models treat space as a symbol
        dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index(" "), 4)
        self.assertEqual(d.index("a"), 5)
        self.assertEqual(d.index("b"), 6)
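The expected indices follow from fairseq's Dictionary reserving the first four slots for its special symbols, so the first symbol read from the file lands at index 4. A minimal sketch of that behavior, assuming the default special symbols of fairseq.data.Dictionary:

from fairseq.data import Dictionary

d = Dictionary()
# the constructor registers "<s>", "<pad>", "</s>" and "<unk>" at indices 0-3
print(d.bos(), d.pad(), d.eos(), d.unk())  # 0 1 2 3
# add_symbol returns the index of the entry, so the first user symbol is 4
print(d.add_symbol("a"))  # 4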
Example #2
    def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64),
                                                  (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents),
                                 max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        # assertAlmostEqual is expected to be a tensor-aware override defined on
        # the test class (stock unittest.assertAlmostEqual only handles numbers)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None
Example #3
    def _get_test_data(self):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([i + 1 for i in src_len])
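One way to sanity-check the tensor this helper builds is to map the indices back through the dictionary. A rough sketch, called from inside the same test class (Dictionary supports item lookup, returning the symbol string for an index):

vocab, x, src_len = self._get_test_data()  # x is (time, batch) after the transpose
batch_first = x.t()
for row, n in zip(batch_first, src_len):
    # vocab[idx] returns the symbol stored at that index
    print([vocab[idx.item()] for idx in row[:n]])
# expected:
# ['he@@', 'llo', 'n@@', 'ew', 'y@@', 'or@@', 'k', '</s>']
# ['how', 'are', 'y@@', 'ou', '</s>']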
Example #4
    def _get_test_data(self, append_eos=True):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor(src_len)
Example #5
    def setup_task(cls, args, **kwargs):
        # Here we can perform any setup required for the task. This may include
        # loading Dictionaries, initializing shared Embedding layers, etc.
        # In this case we'll just load the Dictionaries.
        reloaded = torch.load(args.xlmr_model_dict)
        params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        input_vocab = Dictionary(reloaded['dico_id2word'],
                                 reloaded['dico_word2id'],
                                 reloaded['dico_counts'])
        params.n_words = len(input_vocab)
        params.bos_index = input_vocab.index(BOS_WORD)
        params.eos_index = input_vocab.index(EOS_WORD)
        params.pad_index = input_vocab.index(PAD_WORD)
        params.unk_index = input_vocab.index(UNK_WORD)
        params.mask_index = input_vocab.index(MASK_WORD)

        label_vocab = Dictionary.load(os.path.join(args.data,
                                                   'dict.label.txt'))
        print('| [input] dictionary: {} types'.format(len(input_vocab)))
        print('| [label] dictionary: {} types'.format(len(label_vocab)))

        return SemparseSeq2SeqTask(args, input_vocab, label_vocab)
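For context, setup_task in fairseq is normally a classmethod on a task registered with @register_task; the snippet above omits the decorator. A stripped-down sketch of the same dictionary-loading pattern, assuming an args-based (pre-hydra) fairseq; the task name, dict filename, and constructor here are placeholders rather than the original project's code:

import os

from fairseq.data import Dictionary
from fairseq.tasks import FairseqTask, register_task


@register_task("semparse_seq2seq")  # hypothetical task name
class SketchSemparseTask(FairseqTask):
    @classmethod
    def setup_task(cls, args, **kwargs):
        # Dictionary.load reads a fairseq "dict.txt"-style vocabulary file
        label_vocab = Dictionary.load(os.path.join(args.data, "dict.label.txt"))
        print("| [label] dictionary: {} types".format(len(label_vocab)))
        return cls(args, label_vocab)

    def __init__(self, args, label_vocab):
        super().__init__(args)
        self.label_vocab = label_vocab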
Example #6
    def _convert_src_tokens_to_tensor(
        self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
    ):
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return x, torch.LongTensor(src_len)
Example #7
    def test_overwrite(self):
        # for example, Camembert overwrites <unk>, <s> and </s>
        dict_file = io.StringIO("<unk> 999 #fairseq:overwrite\n"
                                "<s> 999 #fairseq:overwrite\n"
                                "</s> 999 #fairseq:overwrite\n"
                                ", 999\n"
                                "▁de 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        # the overwritten specials are appended after the default ones, so they
        # land at indices 4-6; "<pad>" keeps its slot and unknown words such as
        # "foo" still resolve to the original <unk> index 3
        self.assertEqual(d.index("<pad>"), 1)
        self.assertEqual(d.index("foo"), 3)
        self.assertEqual(d.index("<unk>"), 4)
        self.assertEqual(d.index("<s>"), 5)
        self.assertEqual(d.index("</s>"), 6)
        self.assertEqual(d.index(","), 7)
        self.assertEqual(d.index("▁de"), 8)
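The #fairseq:overwrite flag is what makes the duplicated special tokens legal: without it, add_from_file refuses to re-add a symbol that is already in the dictionary. A small sketch of that failure mode (the exact exception type may vary across fairseq versions):

import io

from fairseq.data import Dictionary

d = Dictionary()
try:
    # "<unk>" is already registered by the constructor and the line carries
    # no "#fairseq:overwrite" flag, so loading this file should fail
    d.add_from_file(io.StringIO("<unk> 999\n"))
except Exception as exc:
    print(type(exc).__name__, exc)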
Example #8
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")
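finalize() re-sorts the non-special symbols by frequency and, with padding_factor=8, pads the vocabulary to a multiple of 8 with filler entries, which is why the test reads counts back through get_count rather than relying on insertion order. A small sketch of that behavior, assuming fairseq's default special symbols:

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("rare", n=1)
d.add_symbol("common", n=100)
d.finalize(threshold=0, nwords=-1, padding_factor=8)
# non-special symbols are now ordered by count, and the vocabulary is
# padded with filler tokens until its size is a multiple of 8
assert d.index("common") < d.index("rare")
assert len(d) % 8 == 0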
Example #9
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
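A quick illustration of the contract: the language symbol must already be in the dictionary, otherwise index() falls back to the <unk> index and the assertion fires (the language names here are illustrative):

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("en")
d.add_symbol("de")
assert _lang_id(d, "de") == 5  # indices 0-3 are the special symbols, "en" is 4
# a language that was never added maps to <unk> and trips the assertion:
# _lang_id(d, "fr")  -> AssertionError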
Example #10
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, \
        'cannot find language token for lang {}'.format(lang)
    return idx
Example #11
    def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
        lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
        assert lang_tag_idx != dictionary.unk()
        return lang_tag_idx
Example #12
def _lang_token_index(dic: Dictionary, lang: str, style="__{}__"):
    """Return language token index."""
    idx = dic.index(_lang_token(lang, style))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx
Example #13
class XLMRobertaTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for our custom pretrained model.

    You can ignore this file if you use another pretrained model; for example,
    with PhoBERT you should tokenize with VnCoreNLP instead.
    """
    def __init__(self,
                 pretrained_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        # note: use sp_model only to split text into pieces; do not take token
        # ids from it, since its piece ids do not line up with the fairseq
        # dictionary loaded below
        self.sp_model.Load(sentencepiece_model)

        self.bpe_dict = Dictionary().load(vocab_file)

        # Mimic the fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {
            "<s>": 0,
            "<pad>": 1,
            "</s>": 2,
            "<unk>": 3
        }

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0

        self.fairseq_tokens_to_ids["<mask>"] = len(
            self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        return len(
            self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab
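Illustrative usage only: the directory path below is a placeholder and must contain the sentencepiece.bpe.model and dict.txt files that __init__ expects.

tokenizer = XLMRobertaTokenizer("path/to/pretrained")  # placeholder directory
pieces = tokenizer.tokenize("xin chào")        # SentencePiece pieces
ids = tokenizer.convert_tokens_to_ids(pieces)  # ids come from the fairseq dict
print(pieces, ids)
print(tokenizer.vocab_size, len(tokenizer.get_vocab()))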