def batch_generator():
    """Yield feed-dict batches of at most ``self.batch_size`` examples.

    Iterates over the eligible example indices (all of them when
    ``is_eval``; otherwise everything not in ``not_allowed``, shuffled),
    fills the pre-allocated ``emb_supports`` / ``emb_questions`` buffers
    row by row, collects the per-example features from the enclosing
    scope, and yields one numpified batch dict per slice of ``todo``.

    Yields:
        dict mapping ``XQAPorts``/``CBOWXqaPorts`` tensor ports to
        batch tensors (selected keys converted via ``numpify``).
    """
    # Eligible example indices; training additionally filters and shuffles.
    todo = [i for i in range(len(q_ids)) if is_eval or i not in not_allowed]
    if not is_eval:
        self._rng.shuffle(todo)
    while todo:
        support_lengths = []
        question_lengths = []
        wiq = []
        spans = []
        span2question = []
        offsets = []
        at_spans = []
        unique_words, unique_word_lengths, question2unique, support2unique = \
            unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab,
                                    todo[:self.batch_size])
        # we have to create batches here and cannot precompute them
        # because of the batch-specific wiq feature
        for i, j in enumerate(todo[:self.batch_size]):
            # Copy token embeddings into the shared batch buffers; rows
            # beyond this batch's lengths are sliced off below.
            for k, sym in enumerate(s_ids[j]):
                emb_supports[i, k] = self._get_emb(sym)
            for k, sym in enumerate(q_ids[j]):
                emb_questions[i, k] = self._get_emb(sym)
            support_lengths.append(s_lengths[j])
            question_lengths.append(q_lengths[j])
            # During training, answer spans longer than _max_span_size are
            # dropped; at eval time all spans are kept.
            aps = [s for s in answer_spans[j]
                   if s[1] - s[0] <= _max_span_size or is_eval]
            spans.extend(aps)
            # Map every kept span back to its in-batch question index.
            span2question.extend([i] * len(aps))
            wiq.append(word_in_question[j])
            offsets.append(token_offsets[j])
            at_spans.append(answertype_spans[j])
        batch_size = len(question_lengths)
        output = {
            XQAPorts.unique_word_chars: unique_words,
            XQAPorts.unique_word_char_length: unique_word_lengths,
            XQAPorts.question_words2unique: question2unique,
            XQAPorts.support_words2unique: support2unique,
            # Slice the shared buffers down to this batch's actual size.
            XQAPorts.emb_support: emb_supports[:batch_size, :max(support_lengths), :],
            XQAPorts.support_length: support_lengths,
            XQAPorts.emb_question: emb_questions[:batch_size, :max(question_lengths), :],
            XQAPorts.question_length: question_lengths,
            XQAPorts.word_in_question: wiq,
            XQAPorts.answer_span: spans,
            XQAPorts.correct_start_training: [] if is_eval else [s[0] for s in spans],
            XQAPorts.answer2question: span2question,
            XQAPorts.answer2question_training: [] if is_eval else span2question,
            XQAPorts.keep_prob: 1.0 if is_eval else 1 - self.dropout,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_char_offsets: offsets,
            CBOWXqaPorts.answer_type_span: at_spans
        }
        # we can only numpify in here, because bucketing is not possible prior
        batch = numpify(output, keys=[XQAPorts.unique_word_chars,
                                      XQAPorts.question_words2unique,
                                      XQAPorts.support_words2unique,
                                      XQAPorts.word_in_question,
                                      XQAPorts.token_char_offsets])
        todo = todo[self.batch_size:]
        yield batch
def __call__(self, qa_settings: List[QASetting]) -> Mapping[TensorPort, np.ndarray]:
    """Build the feed dict of input tensors for a list of QA settings.

    Tokenizes and indexes the settings via ``prepare_data`` (without
    answers), embeds question and support tokens into zero-padded
    buffers, and returns a port-to-tensor mapping with the list-valued
    entries converted through ``numpify``.
    """
    q_tokenized, q_ids, q_lengths, s_tokenized, s_ids, s_lengths, \
        word_in_question, token_offsets, answer_spans, slot = prepare_data(
            qa_settings, self.vocab, self.config.get("lowercase", False),
            with_answers=False)

    unique_words, unique_word_lengths, question2unique, support2unique = \
        unique_words_with_chars(q_tokenized, s_tokenized, self.char_vocab)

    num_examples = len(qa_settings)
    emb_dim = self.emb_matrix.shape[1]
    # Zero-padded embedding buffers, sized by the longest sequence present.
    emb_supports = np.zeros([num_examples, max(s_lengths), emb_dim])
    emb_questions = np.zeros([num_examples, max(q_lengths), emb_dim])
    for row, (question, support) in enumerate(zip(q_ids, s_ids)):
        for col, symbol in enumerate(support):
            emb_supports[row, col] = self._get_emb(symbol)
        for col, symbol in enumerate(question):
            emb_questions[row, col] = self._get_emb(symbol)

    output = {
        XQAPorts.unique_word_chars: unique_words,
        XQAPorts.unique_word_char_length: unique_word_lengths,
        XQAPorts.question_words2unique: question2unique,
        XQAPorts.support_words2unique: support2unique,
        XQAPorts.emb_support: emb_supports,
        XQAPorts.support_length: s_lengths,
        XQAPorts.emb_question: emb_questions,
        XQAPorts.question_length: q_lengths,
        XQAPorts.slot_list: slot,
        XQAPorts.word_in_question: word_in_question,
        XQAPorts.token_char_offsets: token_offsets,
        Ports.Input.question: q_ids
    }
    return numpify(output, keys=[
        XQAPorts.unique_word_chars,
        XQAPorts.question_words2unique,
        XQAPorts.support_words2unique,
        XQAPorts.word_in_question,
        XQAPorts.token_char_offsets,
        XQAPorts.slot_list,
        Ports.Input.question
    ])