Python Tokenizer.num_words示例

编程语言: Python

命名空间/包名称: helpers.Tokenizer

类/类型: Tokenizer

方法/功能: num_words

hotexamples.com的示例: 3

Python Tokenizer.num_words - 已找到3个示例。以下是从开源项目中提取的、评分最高的helpers.Tokenizer.Tokenizer.num_words真实Python示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

Tokenizer(6)

texts_to_sequences(6)

num_words(3)

word_index(3)

fit_on_texts(2)

示例#1

显示文件

文件： WordBasedSeq2Seq1000Units20EpochsGLOVE_seperate_serve.py 项目： ml-nic/NMT-PA

    def predict_one_sentence(self, sentence):
        """Run the model on a single sentence and decode the output to text.

        The sentence is encoded with a tokenizer that uses the word index
        loaded from ``en_word_index.npy``, post-padded/post-truncated to
        ``self.params['MAX_SEQ_LEN']`` and passed to ``self.M.predict``.
        Every output position is decoded by arg-max through the inverted
        word index loaded from ``de_word_index.npy``.

        Args:
            sentence: The raw input text; it is wrapped in a one-element
                list before tokenization.

        Returns:
            The decoded words, each followed by a single space (so a
            non-empty result ends with a trailing space). Decoding of an
            output row stops at ``self.END_TOKEN``; ``self.START_TOKEN`` and
            positions whose arg-max is index 0 are skipped.

        Raises:
            KeyError: If a predicted index has no entry in the inverted
                word index.
        """
        self.__setup_model()

        # NOTE(review): ``.items()` is called on the loaded value further
        # down, so these files are presumably expected to yield dict-like
        # objects — confirm how they were written.
        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        # The 3 extra slots presumably account for the START/END/UNK
        # tokens — TODO confirm against the Tokenizer helper.
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        # NOTE(review): de_tokenizer is never read in this method; kept
        # because the Tokenizer constructor is not visible from here —
        # confirm it is side-effect free and remove.
        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        encoded = en_tokenizer.texts_to_sequences([sentence])
        print(encoded)
        encoded = pad_sequences(encoded,
                                maxlen=self.params['MAX_SEQ_LEN'],
                                padding='post',
                                truncating='post')
        encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])
        print(encoded)

        prediction = self.M.predict(encoded)

        reverse_word_index = {
            index: word for word, index in self.de_word_index.items()}

        words = []
        for token_scores in prediction:
            for token in token_scores:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    # Index 0 is the maximum in this branch, so the
                    # runner-up is the arg-max of the remaining entries.
                    # Add 1 to map the position inside token[1:] back to
                    # the original vocabulary index.
                    second_idx = int(np.argmax(token[1:])) + 1
                    print("second best prediction is ",
                          reverse_word_index[second_idx])
                    continue

                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                if next_word == self.START_TOKEN:
                    continue
                words.append(next_word)

        # Every word is followed by one space, including the last one.
        return "".join(word + " " for word in words)

示例#2

显示文件

文件： WordBasedSeq2Seq1000Units20EpochsGLOVE_seperate_serve.py 项目： ml-nic/NMT-PA

    def calculate_hiddenstate_after_encoder(self, sentence):
        """Return the output of the layer named ``'encoder'`` for a sentence.

        The sentence is tokenized with the word index loaded from
        ``en_word_index.npy``, post-padded/post-truncated to
        ``self.params['MAX_SEQ_LEN']`` and fed through a sub-model that maps
        ``self.M.input`` to the ``'encoder'`` layer's output.

        Args:
            sentence: The raw input text; it is wrapped in a one-element
                list before tokenization.

        Returns:
            Whatever ``predict`` of that sub-model returns for the encoded
            sentence (its ``shape`` is printed before returning).
        """
        self.__setup_model()

        persistence_dir = self.BASIC_PERSISTENCE_DIR
        self.en_word_index = np.load(persistence_dir + '/en_word_index.npy')
        self.de_word_index = np.load(persistence_dir + '/de_word_index.npy')

        special_tokens = (self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN)

        source_tokenizer = Tokenizer(*special_tokens,
                                     num_words=self.params['MAX_WORDS_EN'])
        source_tokenizer.word_index = self.en_word_index
        source_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        # Built the same way as the source tokenizer; not read again below.
        target_tokenizer = Tokenizer(*special_tokens,
                                     num_words=self.params['MAX_WORDS_DE'])
        target_tokenizer.word_index = self.de_word_index
        target_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        token_ids = source_tokenizer.texts_to_sequences([sentence])
        print(token_ids)

        padded = pad_sequences(token_ids,
                               maxlen=self.params['MAX_SEQ_LEN'],
                               padding='post',
                               truncating='post')
        n_rows, n_cols = padded.shape[0], padded.shape[1]
        model_input = padded.reshape(n_rows, n_cols)
        print(model_input)

        # Sub-model that stops at the layer registered under 'encoder'.
        encoder_layer = self.M.get_layer('encoder')
        encoder = Model(inputs=self.M.input, outputs=encoder_layer.output)

        hidden_state = encoder.predict(model_input, batch_size=1)
        print(hidden_state.shape)
        return hidden_state

示例#3

显示文件

文件： WordBasedSeq2Seq1000Units20EpochsGLOVE_seperate_serve.py 项目： ml-nic/NMT-PA

    def predict_batch(self, sentences):
        """Run the model on several sentences and decode each output to text.

        The sentences are encoded with a tokenizer that uses the word index
        loaded from ``en_word_index.npy``, post-padded/post-truncated to
        ``self.params['MAX_SEQ_LEN']`` and passed to ``self.M.predict`` in
        chunks of at most 10 rows. Every output position is decoded by
        arg-max through the inverted word index loaded from
        ``de_word_index.npy``.

        Unlike the previous implementation, a trailing chunk smaller than
        the batch size is also predicted, so the result always has one entry
        per input row, and an empty input returns ``[]`` instead of looping
        with a batch size of 0.

        Args:
            sentences: The raw input texts, passed as-is to the tokenizer's
                ``texts_to_sequences``.

        Returns:
            A list with one decoded string per encoded input row. In each
            string every word is followed by a single space. Decoding of a
            row stops at ``self.END_TOKEN``; ``self.START_TOKEN`` and
            positions whose arg-max is index 0 are skipped.

        Raises:
            KeyError: If a predicted index has no entry in the inverted
                word index.
        """
        self.__setup_model()

        # NOTE(review): ``.items()`` is called on the loaded value further
        # down, so these files are presumably expected to yield dict-like
        # objects — confirm how they were written.
        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        # The 3 extra slots presumably account for the START/END/UNK
        # tokens — TODO confirm against the Tokenizer helper.
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        # NOTE(review): de_tokenizer is never read in this method; kept
        # because the Tokenizer constructor is not visible from here —
        # confirm it is side-effect free and remove.
        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentences)
        encoded = en_tokenizer.texts_to_sequences(sentences)
        print(encoded)
        encoded = pad_sequences(encoded,
                                maxlen=self.params['MAX_SEQ_LEN'],
                                padding='post',
                                truncating='post')
        encoded = encoded.reshape(encoded.shape[0], encoded.shape[1])

        total = encoded.shape[0]
        if total == 0:
            # Nothing to predict; a batch size of 0 would never advance.
            return []
        batch_size = min(total, 10)

        reverse_word_index = {
            index: word for word, index in self.de_word_index.items()}

        predicted_sentences = []
        # Stepping with range() also covers a final chunk that is shorter
        # than batch_size, which the old loop skipped.
        for from_idx in range(0, total, batch_size):
            to_idx = min(from_idx + batch_size, total)
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx, total)
            current_batch = encoded[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for token_scores in prediction:
                words = []
                for token in token_scores:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        # Index 0 is the maximum in this branch, so the
                        # runner-up is the arg-max of the remaining
                        # entries. Add 1 to map the position inside
                        # token[1:] back to the original vocabulary index.
                        second_idx = int(np.argmax(token[1:])) + 1
                        print("second best prediction is ",
                              reverse_word_index[second_idx])
                        continue

                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    if next_word == self.START_TOKEN:
                        continue
                    words.append(next_word)
                # Every word is followed by one space, including the last.
                predicted_sentences.append(
                    "".join(word + " " for word in words))

        return predicted_sentences