def sentence_to_ids(phoneme, target_dict):
    """Convert a space-separated phoneme string into batched id data.

    :param phoneme: input phonemes as a single space-separated string
    :param target_dict: mapping from phoneme token to integer id; must
        contain an entry for UNK (used for out-of-vocabulary tokens)
    :return: result of process_batch_data on the single-sentence batch,
        or None when the input is None or empty
    """
    if phoneme is None or len(phoneme) == 0:
        return None
    # Hoist the UNK fallback id so the dict lookup happens once,
    # not once per out-of-vocabulary token inside the comprehension.
    unk_id = target_dict[UNK]
    tokens = phoneme.split(" ")
    ids = [target_dict.get(token, unk_id) for token in tokens]
    # Second argument is an empty char-level batch (unused for phonemes).
    return process_batch_data([ids], [[]], target_dict)
def sentence_to_ids(sentence, target_dict):
    """Tokenize a raw sentence and convert it into batched id data.

    :param sentence: input sentence as a plain string
    :param target_dict: mapping from token to integer id; must contain an
        entry for UNK (used for out-of-vocabulary tokens)
    :return: result of process_batch_data on the single-sentence batch,
        or None when the input is None or empty
    """
    if sentence is None or len(sentence) == 0:
        return None
    # only_alphanumeric=False: keep punctuation so word_tokenize can
    # split it into separate tokens.
    sentence = cleanup_sentence(sentence, only_alphanumeric=False)
    tokens = word_tokenize(sentence)
    # Hoist the UNK fallback id — one lookup instead of one per OOV token.
    unk_id = target_dict[UNK]
    ids = [target_dict.get(token, unk_id) for token in tokens]
    # Second argument is an empty char-level batch (chars unused here).
    return process_batch_data([ids], [[]], target_dict)
def sentence_to_ids(sentence, source_dict, language, lower, keep_number):
    """Clean up a sentence for the given language and convert it to ids.

    :param sentence: input sentence as a plain string
    :param source_dict: mapping from token to integer id; must contain an
        entry for UNK (used for out-of-vocabulary tokens)
    :param language: source language passed through to cleanup_sentence
    :param lower: boolean, convert sentence to lowercase or not
    :param keep_number: boolean, whether to keep the number characters
    :return: result of process_batch_data on the single-sentence batch,
        or None when the input is None or empty
    """
    if sentence is None or len(sentence) == 0:
        return None
    sentence = cleanup_sentence(sentence, language, lower, keep_number)
    # Whitespace tokenization: cleanup_sentence is assumed to have already
    # normalized token boundaries — TODO confirm against its definition.
    tokens = sentence.split()
    # Hoist the UNK fallback id — one lookup instead of one per OOV token.
    unk_id = source_dict[UNK]
    ids = [source_dict.get(token, unk_id) for token in tokens]
    # Second argument is an empty char-level batch (chars unused here).
    return process_batch_data([ids], [[]], source_dict)
def words_to_indices(self, words):
    """Convert input words into batched word/char indices for inference.

    :param words: list of input word strings
    :return: batched word and char indices from process_batch_data
    """
    # Hoist both UNK fallback ids — looked up once instead of once per
    # out-of-vocabulary character/word.
    unk_char = self.char_dict[UNK]
    unk_word = self.word_dict[UNK]
    # Char ids are built from the raw surface form of each word;
    # dict.get replaces the original "x in d ... d[x]" double lookup.
    chars_idx = [
        [self.char_dict.get(char, unk_char) for char in word]
        for word in words
    ]
    # Word ids use the normalized form (word_convert) of each word.
    # Normalizing inline also avoids rebinding the `words` parameter.
    words_idx = [
        self.word_dict.get(word_convert(word), unk_word) for word in words
    ]
    return process_batch_data([words_idx], [chars_idx])