Example #1
def encode(self, x: str) -> str:
    # Tokenize the raw sentence, passing along the configured space
    # symbol and the list of non-language symbols to keep intact.
    y = tokenize(x,
                 space=self.space_symbol,
                 non_lang_syms=self.non_lang_syms)
    # Optionally append a trailing space symbol to mark the end of the sentence.
    if self.ends_with_space:
        return y + " " + self.space_symbol
    else:
        return y
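
The example above relies on a character-level tokenize() helper. As a rough, self-contained sketch of its assumed behaviour (the function below is an illustration, not the library implementation), it might look like this:

# Assumed behaviour, for illustration only: split a sentence into
# space-separated characters, replace word boundaries with a dedicated
# space symbol, and keep non-language symbols (e.g. "<noise>") intact.
def tokenize(sent, space="<space>", non_lang_syms=None):
    non_lang_syms = non_lang_syms if non_lang_syms is not None else []
    tokens = []
    for word in sent.strip().split():
        if word in non_lang_syms:
            tokens.append(word)          # keep special symbols whole
        else:
            tokens.extend(list(word))    # split ordinary words into characters
        tokens.append(space)             # mark the word boundary
    return " ".join(tokens[:-1])         # drop the trailing space symbol

print(tokenize("hello <noise> world", non_lang_syms=["<noise>"]))
# h e l l o <space> <noise> <space> w o r l d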
Example #2
    def test_speech_tokenizer(self):
        for i, sent in enumerate(self.text):
            print('test sentence {}:'.format(i))
            print(sent)
            tokens = utils.tokenize(
                sent,
                space=self.dictionary.space_word,
                non_lang_syms=self.non_lang_syms,
            )

            # test :func:`~speech_tools.utils.tokenize` with
            # :func:`~AsrDictionary.encode_line`
            tensor = self.dictionary.encode_line(
                tokens,
                add_if_not_exist=False,
                append_eos=True,
            )
            reconstructed_tokens = self.dictionary.string(tensor)
            expected_tokens = ' '.join([
                token if self.dictionary.index(token) != self.dictionary.unk()
                else self.dictionary.unk_word for token in tokens.split(' ')
            ])
            self.assertEqual(reconstructed_tokens, expected_tokens)

            # test :func:`~speech_tools.utils.tokenize` with
            # :func:`~AsrDictionary.tokens_to_sentence`
            reconstructed_sent = self.dictionary.tokens_to_sentence(tokens)
            expected_sent = []
            words = sent.split(' ')
            for w in words:
                if w not in self.non_lang_syms:
                    new_word = ''.join([
                        self.dictionary.unk_word if c in self.oovs else c
                        for c in w
                    ])
                    expected_sent.append(new_word)
                else:
                    expected_sent.append(w)
            expected_sent = ' '.join(expected_sent)
            self.assertEqual(reconstructed_sent, expected_sent)
Example #3
def tokenizer(x: str) -> List[str]:
    # Assumes "from typing import List"; tokenize and subword_dict
    # come from the enclosing scope.
    return tokenize(
        x, non_lang_syms=subword_dict.non_lang_syms).split(' ')
Example #4
def tokenizer(x):
    # Same closure as Example #3, just without type hints.
    return tokenize(x, non_lang_syms=subword_dict.non_lang_syms).split(" ")