示例#1
0
    def _fill_input_queue(self):
        """Convert raw (article, abstract) pairs into model inputs, forever.

        Pulls pairs from ``self._gen_text(data.gen_example(self._data_path))``,
        splits each text into sentences, maps the leading sentences to word
        ids, filters or truncates by length, pads to the configured number of
        timesteps and puts one ``MODEL_INPUT`` per accepted example on
        ``self._input_queue``.

        Examples shorter than ``self._min_input_len`` (on either side) are
        dropped. Examples longer than ``self._enc_timesteps`` /
        ``self._dec_timesteps`` are truncated when ``self._truncate_input`` is
        truthy and dropped otherwise.

        This method never returns normally: the loop only ends when an
        exception propagates, e.g. ``StopIteration`` from ``six.next`` if the
        underlying generator is ever exhausted.
        """
        sentence_start_id = self._vocab.word_to_id(data.SENTENCE_START)
        sentence_end_id = self._vocab.word_to_id(data.SENTENCE_END)
        pad_id = self._vocab.word_to_id(data.PAD_TOKEN)

        input_gen = self._gen_text(data.gen_example(self._data_path))

        while True:
            article, abstract = six.next(input_gen)
            # NOTE(review): the original comment says existing <s> and </s>
            # markers are stripped here; presumably that is what the `False`
            # flag controls -- confirm against convert_paragraph_to_sentences.
            article_sentences = [
                sent.strip()
                for sent in data.convert_paragraph_to_sentences(article, False)
            ]
            abstract_sentences = [
                sent.strip()
                for sent in data.convert_paragraph_to_sentences(abstract, False)
            ]

            # Only the first N sentences of each side are converted to word ids.
            # NOTE(review): the slices assume the sentence limits are
            # non-negative -- confirm against the configuration.
            enc_inputs = []
            for sentence in article_sentences[:self._max_article_sentences]:
                enc_inputs.extend(data.convert_words_to_ids(sentence, self._vocab))

            # <s> doubles as the <GO> symbol at the head of the decoder inputs.
            dec_inputs = [sentence_start_id]
            for sentence in abstract_sentences[:self._max_abstract_sentences]:
                dec_inputs.extend(data.convert_words_to_ids(sentence, self._vocab))

            # Filter out too-short input. The decoder length includes the
            # leading <s> at this point.
            if len(enc_inputs) < self._min_input_len or len(dec_inputs) < self._min_input_len:
                LOGGER.warning('drop an example - too short. enc: %d, dec: %d', len(enc_inputs), len(dec_inputs))
                continue

            if not self._truncate_input:
                if len(enc_inputs) > self._enc_timesteps or len(dec_inputs) > self._dec_timesteps:
                    LOGGER.warning('drop an example - too long. enc: %d, dec: %d', len(enc_inputs), len(dec_inputs))
                    continue
            else:
                # Slicing leaves sequences that are already short enough untouched.
                enc_inputs = enc_inputs[:self._enc_timesteps]
                dec_inputs = dec_inputs[:self._dec_timesteps]

            # Targets are the decoder inputs shifted left by one: drop the
            # leading <s> and close the sequence with </s>.
            targets = dec_inputs[1:]
            targets.append(sentence_end_id)

            # Record the true lengths before padding hides them.
            enc_input_len = len(enc_inputs)
            dec_output_len = len(targets)

            # Pad up to the fixed number of timesteps. A non-positive
            # multiplier yields an empty list, so full sequences stay as-is.
            # The decoder side is padded with </s>, the encoder side with PAD.
            enc_inputs += [pad_id] * (self._enc_timesteps - len(enc_inputs))
            dec_inputs += [sentence_end_id] * (self._dec_timesteps - len(dec_inputs))
            targets += [sentence_end_id] * (self._dec_timesteps - len(targets))

            # 'enc_input dec_input target enc_len dec_len origin_article origin_abstract'
            element = MODEL_INPUT(
                enc_input=enc_inputs,
                dec_input=dec_inputs,
                target=targets,
                enc_len=enc_input_len,
                dec_len=dec_output_len,
                origin_article=' '.join(article_sentences),
                origin_abstract=' '.join(abstract_sentences)
            )

            self._input_queue.put(element)