def _fill_input_queue(self):
    """Convert raw (article, abstract) pairs into padded examples and enqueue them.

    For every pair pulled from the text generator this method:

    1. splits both texts into stripped sentences,
    2. converts the first ``_max_article_sentences`` /
       ``_max_abstract_sentences`` sentences to word ids,
    3. drops examples shorter than ``_min_input_len``,
    4. drops (or, when ``_truncate_input`` is set, truncates) examples longer
       than ``_enc_timesteps`` / ``_dec_timesteps``,
    5. pads everything to the configured number of timesteps, and
    6. puts the resulting ``MODEL_INPUT`` on ``self._input_queue``.

    The loop has no exit condition of its own: the method only stops when an
    exception propagates out of it, e.g. ``StopIteration`` once the underlying
    generator is exhausted.
    """
    vocab = self._vocab
    sentence_start_id = vocab.word_to_id(data.SENTENCE_START)
    sentence_end_id = vocab.word_to_id(data.SENTENCE_END)
    pad_id = vocab.word_to_id(data.PAD_TOKEN)
    input_gen = self._gen_text(data.gen_example(self._data_path))

    # A negative limit selects no sentences at all (same as an empty range).
    article_limit = max(self._max_article_sentences, 0)
    abstract_limit = max(self._max_abstract_sentences, 0)

    while True:
        (article, abstract) = next(input_gen)

        # NOTE(review): the False flag presumably asks for sentences without
        # <s>/</s> markers -- confirm against data.convert_paragraph_to_sentences.
        article_sentences = [
            sent.strip()
            for sent in data.convert_paragraph_to_sentences(article, False)
        ]
        abstract_sentences = [
            sent.strip()
            for sent in data.convert_paragraph_to_sentences(abstract, False)
        ]

        # Convert the first N sentences of each text to word ids.
        enc_inputs = []
        for sentence in article_sentences[:article_limit]:
            enc_inputs += data.convert_words_to_ids(sentence, vocab)

        # <s> doubles as the <GO> symbol that starts the decoder input.
        dec_inputs = [sentence_start_id]
        for sentence in abstract_sentences[:abstract_limit]:
            dec_inputs += data.convert_words_to_ids(sentence, vocab)

        # Filter out too-short input.
        if (len(enc_inputs) < self._min_input_len
                or len(dec_inputs) < self._min_input_len):
            LOGGER.warning('drop an example - too short. enc: %d, dec: %d',
                           len(enc_inputs), len(dec_inputs))
            continue

        if self._truncate_input:
            # Slicing is a no-op for sequences that already fit.
            enc_inputs = enc_inputs[:self._enc_timesteps]
            dec_inputs = dec_inputs[:self._dec_timesteps]
        elif (len(enc_inputs) > self._enc_timesteps
              or len(dec_inputs) > self._dec_timesteps):
            LOGGER.warning('drop an example - too long. enc: %d, dec: %d',
                           len(enc_inputs), len(dec_inputs))
            continue

        # Targets are the decoder inputs shifted left by one: drop the
        # leading <s> and append </s>.
        targets = dec_inputs[1:] + [sentence_end_id]

        # Record the real lengths before padding is added.
        enc_input_len = len(enc_inputs)
        dec_output_len = len(targets)

        # Pad to the fixed number of timesteps. The encoder side uses the pad
        # token; decoder inputs and targets are padded with </s>. A
        # non-positive count multiplies to an empty list, so nothing is added
        # to sequences that are already full length.
        enc_inputs += [pad_id] * (self._enc_timesteps - len(enc_inputs))
        dec_inputs += [sentence_end_id] * (self._dec_timesteps - len(dec_inputs))
        targets += [sentence_end_id] * (self._dec_timesteps - len(targets))

        element = MODEL_INPUT(
            enc_input=enc_inputs,
            dec_input=dec_inputs,
            target=targets,
            enc_len=enc_input_len,
            dec_len=dec_output_len,
            origin_article=' '.join(article_sentences),
            origin_abstract=' '.join(abstract_sentences),
        )
        self._input_queue.put(element)