def __init__(self, instances, options, word_vocab=None):
        """Assemble a padded batch from a list of graph instances.

        Every instance is a 7-tuple ``(lex_idx, lex_chars_idx, in_neigh,
        in_neigh_hidden, in_label_idx, entity_indices, y)``.  The ragged
        fields are padded through ``padding_utils`` and ``y`` is turned into
        an int32 array.  Character-level fields are only built when
        ``options.with_char`` is set.
        """
        self.options = options
        self.instances = instances  # list of tuples
        self.batch_size = len(instances)
        self.vocab = word_vocab

        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        # number of nodes of each instance: [batch_size]
        self.node_num = np.array(
            [len(lex_idx) for (lex_idx, _, _, _, _, _, _) in instances],
            dtype=np.int32)

        # number of characters of each node
        if options.with_char:
            chars_per_node = []
            for inst in instances:
                chars_per_node.append([len(chars) for chars in inst[1]])
            self.nodes_chars_num = pad_2d(chars_per_node)

        # masks holding a 1 for every real (non-padding) entry
        neigh_ones = [[[1 for _ in neighs] for neighs in inst[2]]
                      for inst in instances]
        entity_ones = [[[1 for _ in members] for members in inst[5]]
                       for inst in instances]
        self.in_neigh_mask = pad_3d(neigh_ones)  # [batch_size, node_num, neigh_num]
        self.entity_indices_mask = pad_3d(entity_ones)

        # gather the raw columns first, then pad them into arrays
        node_rows = [inst[0] for inst in instances]
        if options.with_char:
            char_rows = [inst[1] for inst in instances]  # [batch_size, sent_len, char_num]
        neigh_rows = [inst[2] for inst in instances]
        neigh_hidden_rows = [inst[3] for inst in instances]
        edge_rows = [inst[4] for inst in instances]
        entity_rows = [inst[5] for inst in instances]
        labels = [inst[6] for inst in instances]

        self.nodes = pad_2d(node_rows)
        if options.with_char:
            self.nodes_chars = pad_3d(char_rows)
        self.in_neigh_indices = pad_3d(neigh_rows)
        self.in_neigh_hidden_indices = pad_3d(neigh_hidden_rows)
        self.in_neigh_edges = pad_3d(edge_rows)
        self.entity_indices = pad_3d(entity_rows)
        self.y = np.asarray(labels, dtype='int32')

        # every neighbour-shaped array must line up with its mask
        neigh_shape = self.in_neigh_mask.shape
        assert neigh_shape == self.in_neigh_indices.shape
        assert neigh_shape == self.in_neigh_hidden_indices.shape
        assert neigh_shape == self.in_neigh_edges.shape
        assert self.entity_indices_mask.shape == self.entity_indices.shape

        # the second axis of the entity arrays has to equal options.entity_num
        assert self.entity_indices.shape[1] == options.entity_num
        assert self.entity_indices_mask.shape[1] == options.entity_num
示例#2
0
    def build_phrase_vocabs(self):
        """Build per-instance phrase vocabularies and padded phrase arrays.

        For every ``(sent1, sent2, sent3)`` instance the candidate chunks of
        ``sent1`` are collected (syntax chunks when
        ``options.withSyntaxChunk`` is set, all possible chunks otherwise) and
        each chunk is mapped to an index:

        * a chunk whose start equals its end is looked up in ``self.vocab``;
        * any other chunk gets an instance-local id, allocated from
          ``self.vocab.vocab_size + 1`` upwards in order of first appearance.

        Results are stored on ``self``: ``phrase_vocabs`` (one prefix tree per
        instance), ``phrase_starts`` / ``phrase_ends`` / ``phrase_idx``
        (padded 2-D arrays), ``phrase_lengths`` (int32 array),
        ``max_phrase_size`` and, when ``options.with_target_lattice`` is set,
        ``target_lattices``.
        """
        self.phrase_vocabs = []
        # phrase ids start right after the word ids
        word_size = self.vocab.vocab_size + 1

        self.phrase_starts = []
        self.phrase_ends = []
        self.phrase_idx = []
        self.phrase_lengths = []
        self.max_phrase_size = 0
        if self.options.with_target_lattice:
            self.target_lattices = []
        for (sent1, sent2, _) in self.instances:
            # collect all phrases
            if self.options.withSyntaxChunk:
                (cur_phrase_starts, cur_phrase_ends, _) = sent1.collect_all_syntax_chunks(self.options.max_chunk_len)
            else:
                (cur_phrase_starts, cur_phrase_ends) = sent1.collect_all_possible_chunks(self.options.max_chunk_len)

            # collect phrase vocab and map phrase into phrase_id
            cur_phrase2id = {}
            cur_phrase_idx = []
            for i, cur_start in enumerate(cur_phrase_starts):
                cur_end = cur_phrase_ends[i]
                cur_phrase = sent1.getTokChunk(cur_start, cur_end)
                if cur_start == cur_end:
                    cur_index = self.vocab.getIndex(cur_phrase)
                elif cur_phrase in cur_phrase2id:
                    # `in` replaces dict.has_key(), which no longer exists in Python 3
                    cur_index = cur_phrase2id[cur_phrase]
                else:
                    cur_index = len(cur_phrase2id) + word_size
                    cur_phrase2id[cur_phrase] = cur_index
                cur_phrase_idx.append(cur_index)
            cur_phrase_vocab = phrase_lattice_utils.prefix_tree(cur_phrase2id)
            self.phrase_vocabs.append(cur_phrase_vocab)
            self.phrase_starts.append(cur_phrase_starts)
            self.phrase_ends.append(cur_phrase_ends)
            self.phrase_idx.append(cur_phrase_idx)
            self.phrase_lengths.append(len(cur_phrase_starts))
            # track the largest instance-local phrase vocabulary in the batch
            self.max_phrase_size = max(self.max_phrase_size, len(cur_phrase2id))

            if self.options.with_target_lattice:
                cur_lattice = phrase_lattice_utils.phrase_lattice(sent2.words, word_vocab=self.vocab, prefix_tree=cur_phrase_vocab)
                self.target_lattices.append(cur_lattice)

        self.phrase_starts = padding_utils.pad_2d_vals_no_size(self.phrase_starts) # [batch_size, phrase_size]
        self.phrase_ends = padding_utils.pad_2d_vals_no_size(self.phrase_ends) # [batch_size, phrase_size]
        self.phrase_idx = padding_utils.pad_2d_vals_no_size(self.phrase_idx) # [batch_size, phrase_size]
        self.phrase_lengths = np.array(self.phrase_lengths, dtype=np.int32) # [batch_size]
    def __init__(self, ori_batch):
        """Create the padded, array-backed counterpart of ``ori_batch``.

        Metadata (options, AMR nodes, ids, references, batch size, vocab) is
        copied over as is.  Length fields become int32 arrays, graph fields
        are padded to the batch maximum, and the sentence fields are padded
        to ``options.max_answer_len``.
        """
        opts = ori_batch.options
        self.options = opts
        self.amr_node = ori_batch.amr_node
        self.id = ori_batch.id
        self.target_ref = ori_batch.target_ref
        self.batch_size = ori_batch.batch_size
        self.vocab = ori_batch.vocab

        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        def pad_to_answer_len(rows):
            # pad a list of rows to [len(rows), max_answer_len]
            return padding_utils.pad_2d_vals(rows, len(rows), opts.max_answer_len)

        # per-instance lengths
        self.node_num = np.array(ori_batch.node_num, dtype=np.int32)
        self.sent_len = np.array(ori_batch.sent_len, dtype=np.int32)
        self.sent_pos_len = np.array(ori_batch.sent_pos_len, dtype=np.int32)

        # neighbour masks
        self.in_neigh_mask = pad_3d(ori_batch.in_neigh_mask)
        self.out_neigh_mask = pad_3d(ori_batch.out_neigh_mask)

        # graph contents
        self.nodes = pad_2d(ori_batch.nodes)
        if opts.with_char:
            self.nodes_chars = pad_3d(ori_batch.nodes_chars)
        self.in_neigh_indices = pad_3d(ori_batch.in_neigh_indices)
        self.in_neigh_edges = pad_3d(ori_batch.in_neigh_edges)
        self.out_neigh_indices = pad_3d(ori_batch.out_neigh_indices)
        self.out_neigh_edges = pad_3d(ori_batch.out_neigh_edges)

        # indices and edge labels must line up with their masks
        in_shape = self.in_neigh_mask.shape
        out_shape = self.out_neigh_mask.shape
        assert in_shape == self.in_neigh_indices.shape
        assert in_shape == self.in_neigh_edges.shape
        assert out_shape == self.out_neigh_indices.shape
        assert out_shape == self.out_neigh_edges.shape

        # [batch_size, sent_len_max]
        self.sent_inp = pad_to_answer_len(ori_batch.sent_inp)
        self.sent_out = pad_to_answer_len(ori_batch.sent_out)
        self.sent_pos_inp = pad_to_answer_len(ori_batch.sent_pos_inp)
        self.sent_pos_out = pad_to_answer_len(ori_batch.sent_pos_out)
示例#4
0
    def __init__(self, ori_batch):
        """Create the padded, array-backed counterpart of ``ori_batch``.

        Metadata and string fields are copied as is; length fields become
        int32 arrays; ragged fields are padded through ``padding_utils``.

        * ``entity_edges`` / ``entity_edges_mask`` are only built when
          ``options.with_grn`` or ``options.with_gcn`` is set, else ``None``.
        * ``ref`` is ``None`` when ``ori_batch.ref`` is ``None``.
        * Character fields are only built when ``options.with_char`` is set.
        """
        self.batch_size = ori_batch.batch_size
        self.options = ori_batch.options
        self.vocab = ori_batch.vocab
        self.char_vocab = ori_batch.char_vocab

        self.ids = ori_batch.ids
        self.candidates_str = ori_batch.candidates_str
        self.ref_str = ori_batch.ref_str

        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        # making ndarray
        self.question = pad_2d(ori_batch.question)
        self.question_len = np.array(ori_batch.question_len, dtype=np.int32)
        self.passage = pad_2d(ori_batch.passage)
        self.passage_len = np.array(ori_batch.passage_len, dtype=np.int32)
        self.entity_start = pad_2d(ori_batch.entity_start)
        self.entity_end = pad_2d(ori_batch.entity_end)
        self.entity_len = np.array(ori_batch.entity_len, dtype=np.int32)
        if self.options.with_grn or self.options.with_gcn:
            self.entity_edges = pad_3d(ori_batch.entity_edges)
            self.entity_edges_mask = pad_3d(
                ori_batch.entity_edges_mask, dtype=np.float32)
        else:
            self.entity_edges = None
            self.entity_edges_mask = None
        self.cands = pad_3d(ori_batch.cands)
        self.cands_len = np.array(ori_batch.cands_len, dtype=np.int32)
        self.cands_occur_mask = pad_3d(
            ori_batch.cands_occur_mask, dtype=np.float32)
        # Identity test: `!= None` would compare element-wise (and then fail
        # in the `if`) should ref ever be an ndarray with several elements.
        if ori_batch.ref is not None:
            self.ref = np.array(ori_batch.ref, dtype=np.int32)
        else:
            self.ref = None
        if self.options.with_char:
            self.question_chars = pad_3d(ori_batch.question_chars)
            self.question_chars_num = pad_2d(ori_batch.question_chars_num)
            self.passage_chars = pad_3d(ori_batch.passage_chars)
            self.passage_chars_num = pad_2d(ori_batch.passage_chars_num)
示例#5
0
    def __init__(self, instances, options, word_vocab=None, char_vocab=None,
                 POS_vocab=None, NER_vocab=None):
        """Build a padded batch from ``(sent1, sent2, sent3)`` instances.

        ``sent3`` is optional; whether it is present is decided from the first
        instance only.  Word, char, POS and NER arrays are built according to
        the matching ``options.with_*`` flags.  With
        ``options.with_phrase_projection`` the phrase vocabularies are built
        as well, and the generator inputs/outputs may be replaced by a
        max-matching partition.
        """
        self.options = options

        self.instances = instances
        self.batch_size = len(instances)
        self.vocab = word_vocab

        self.passage_words = [inst[0].tokText.split() for inst in instances]

        # the first instance decides for the whole batch
        self.has_sent3 = instances[0][2] is not None

        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        def gather(field):
            # one list per sentence slot holding ``getattr(sentN, field)``;
            # the third list stays empty when the batch has no sent3
            col1, col2, col3 = [], [], []
            for (s1, s2, s3) in instances:
                col1.append(getattr(s1, field))
                col2.append(getattr(s2, field))
                if self.has_sent3:
                    col3.append(getattr(s3, field))
            return col1, col2, col3

        # create length: each is [batch_size]
        len1, len2, len3 = [], [], []
        for (s1, s2, s3) in instances:
            len1.append(s1.get_length())
            len2.append(s2.get_length())
            if self.has_sent3:
                len3.append(s3.get_length())
        self.sent1_length = np.array(len1, dtype=np.int32)
        self.sent2_length = np.array(len2, dtype=np.int32)
        if self.has_sent3:
            self.sent3_length = np.array(len3, dtype=np.int32)

        # create word representation
        start_id = word_vocab.getIndex('<s>')
        end_id = word_vocab.getIndex('</s>')  # looked up but not used below
        if options.with_word:
            words1, words2, words3 = gather('word_idx_seq')
            # decoder input: <s> followed by the sequence minus its last item
            shifted2 = [[start_id] + seq[:-1] for seq in words2]
            self.sent1_word = pad_2d(words1)  # [batch_size, sent1_len]
            self.sent2_word = padding_utils.pad_2d_vals(
                words2, len(words2), options.max_answer_len)
            self.sent2_input_word = padding_utils.pad_2d_vals(
                shifted2, len(shifted2), options.max_answer_len)
            if self.has_sent3:
                self.sent3_word = pad_2d(words3)  # [batch_size, sent3_len]

            self.in_answer_words = self.sent2_word
            self.gen_input_words = self.sent2_input_word
            self.answer_lengths = self.sent2_length

        if options.with_char:
            chars1, chars2, chars3 = gather('char_idx_seq')
            self.sent1_char = pad_3d(chars1)
            self.sent2_char = pad_3d(chars2)
            if self.has_sent3:
                self.sent3_char = pad_3d(chars3)

        if options.with_POS:
            pos1, pos2, pos3 = gather('POS_idx_seq')
            self.sent1_POS = pad_2d(pos1)  # [batch_size, sent1_len]
            self.sent2_POS = pad_2d(pos2)  # [batch_size, sent2_len]
            if self.has_sent3:
                self.sent3_POS = pad_2d(pos3)  # [batch_size, sent3_len]

        if options.with_NER:
            ner1, ner2, ner3 = gather('NER_idx_seq')
            self.sent1_NER = pad_2d(ner1)  # [batch_size, sent1_len]
            self.sent2_NER = pad_2d(ner2)  # [batch_size, sent2_len]
            if self.has_sent3:
                self.sent3_NER = pad_2d(ner3)  # [batch_size, sent3_len]

        if options.with_phrase_projection:
            self.build_phrase_vocabs()
            if options.pretrain_with_max_matching and options.with_target_lattice:
                # swap the generator inputs/outputs for the sampled partition
                partition = self.sample_a_partition(max_matching=True)
                (_, prediction_lengths, generator_input_idx,
                 generator_output_idx) = partition
                self.in_answer_words = generator_output_idx
                self.gen_input_words = generator_input_idx
                self.answer_lengths = prediction_lengths
示例#6
0
    def __init__(self, instances, options, word_vocab=None):
        """Build a padded graph-to-sentence batch.

        Every instance is a 9-tuple ``(nodes_idx, nodes_chars_idx,
        in_neigh_indices, in_neigh_edges_idx, out_neigh_indices,
        out_neigh_edges_idx, sentence_idx, sentence, id)``.  Graph fields are
        padded to the batch maximum; the sentence input/output sequences are
        padded to ``options.max_answer_len``.
        """
        self.options = options

        self.amr_node = [inst[0] for inst in instances]
        self.id = [inst[-1] for inst in instances]
        self.target_ref = [inst[-2] for inst in instances]  # list of tuples
        self.batch_size = len(instances)
        self.vocab = word_vocab

        max_len = options.max_answer_len
        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        # create length: both are [batch_size]
        node_counts = []
        sent_lens = []
        for (nodes_idx, _, _, _, _, _, sentence_idx, _, _) in instances:
            node_counts.append(len(nodes_idx))
            # one extra position for the boundary symbol, capped at max_len
            sent_lens.append(min(len(sentence_idx) + 1, max_len))
        self.node_num = np.array(node_counts, dtype=np.int32)
        self.sent_len = np.array(sent_lens, dtype=np.int32)

        # node char num
        if options.with_char:
            char_counts = []
            for inst in instances:
                char_counts.append([len(chars) for chars in inst[1]])
            self.nodes_chars_num = pad_2d(char_counts)

        # neigh mask: a 1 for every real neighbour
        in_ones = [[[1 for _ in neighs] for neighs in inst[2]]
                   for inst in instances]
        out_ones = [[[1 for _ in neighs] for neighs in inst[4]]
                    for inst in instances]
        self.in_neigh_mask = pad_3d(in_ones)  # [batch_size, node_num, neigh_num]
        self.out_neigh_mask = pad_3d(out_ones)

        # create word representation
        start_id = word_vocab.getIndex('<s>')
        end_id = word_vocab.getIndex('</s>')

        node_rows = [inst[0] for inst in instances]
        if options.with_char:
            char_rows = [inst[1] for inst in instances]  # [batch_size, sent_len, char_num]
        in_index_rows = [inst[2] for inst in instances]
        in_edge_rows = [inst[3] for inst in instances]
        out_index_rows = [inst[4] for inst in instances]
        out_edge_rows = [inst[5] for inst in instances]

        sent_inp = []
        sent_out = []
        for (_, _, _, _, _, _, sentence_idx, _, _) in instances:
            if len(sentence_idx) < max_len:
                # room for the boundary symbols on both sides
                sent_inp.append([start_id] + sentence_idx)
                sent_out.append(sentence_idx + [end_id])
            else:
                # too long: no end symbol; the input drops the last token
                sent_inp.append([start_id] + sentence_idx[:-1])
                sent_out.append(sentence_idx)

        # making ndarray
        self.nodes = pad_2d(node_rows)
        if options.with_char:
            self.nodes_chars = pad_3d(char_rows)
        self.in_neigh_indices = pad_3d(in_index_rows)
        self.in_neigh_edges = pad_3d(in_edge_rows)
        self.out_neigh_indices = pad_3d(out_index_rows)
        self.out_neigh_edges = pad_3d(out_edge_rows)

        # indices and edge labels must line up with their masks
        in_shape = self.in_neigh_mask.shape
        out_shape = self.out_neigh_mask.shape
        assert in_shape == self.in_neigh_indices.shape
        assert in_shape == self.in_neigh_edges.shape
        assert out_shape == self.out_neigh_indices.shape
        assert out_shape == self.out_neigh_edges.shape

        # [batch_size, sent_len_max]
        self.sent_inp = padding_utils.pad_2d_vals(sent_inp, len(sent_inp), max_len)
        self.sent_out = padding_utils.pad_2d_vals(sent_out, len(sent_out), max_len)
示例#7
0
    def __init__(self, instances, options, word_vocab=None):
        """Build a padded batch from sentence-graph instances.

        Fields of one instance, by position:
        0-toks_idx, 1-toks_chars_idx, 2-poses_idx, 3-nes, 4-entity_indices,
        5-in_neigh, 6-in_label_idx, 7-in_prob, 8-out_neigh, 9-out_label_idx,
        10-out_prob, 11-ref, 12-id.

        Character and POS arrays are only built when ``options.with_char`` /
        ``options.with_POS`` are set.  The neighbour probability arrays are
        only built when field 7 of the first instance is not ``None``.
        """
        self.options = options

        self.instances = instances  # list of tuples
        self.batch_size = len(instances)
        self.vocab = word_vocab

        pad_2d = padding_utils.pad_2d_vals_no_size
        pad_3d = padding_utils.pad_3d_vals_no_size

        # sentence length
        self.sentence_lengths = np.array(
            [len(inst[0]) for inst in instances],
            dtype=np.int32)  # [batch_size]

        # sentence char length
        if options.with_char:
            self.sentence_chars_lengths = pad_2d(
                [[len(toks_chars_idx) for toks_chars_idx in inst[1]]
                 for inst in instances])

        # neigh mask: a 1 for every real (non-padding) entry
        self.entity_indices_mask = pad_3d(
            [[[1 for _ in entity] for entity in inst[4]]
             for inst in instances])  # [batch_size, 2, indices]
        self.in_neigh_mask = pad_3d(
            [[[1 for _ in in_neigh] for in_neigh in inst[5]]
             for inst in instances])  # [batch_size, sentence_num, neigh_num]
        self.out_neigh_mask = pad_3d(
            [[[1 for _ in out_neigh] for out_neigh in inst[8]]
             for inst in instances])  # [batch_size, sentence_num, neigh_num]

        # Identity test instead of `!= None`: an equality test would compare
        # element-wise (and fail in the `if`) for a multi-element ndarray.
        has_neigh_prob = instances[0][7] is not None

        # the actual contents
        self.sentence_words = [x[0] for x in instances]
        if options.with_char:
            self.sentence_chars = [x[1] for x in instances]  # [batch_size, sent_len, char_num]
        if options.with_POS:
            self.sentence_POSs = [x[2] for x in instances]
        self.nes = [x[3] for x in instances]  # [batch_size, sent_len]
        self.entity_indices = [x[4] for x in instances]  # [batch_size, 2, indices]
        self.in_neigh_indices = [x[5] for x in instances]
        self.in_neigh_edges = [x[6] for x in instances]
        self.out_neigh_indices = [x[8] for x in instances]
        self.out_neigh_edges = [x[9] for x in instances]
        if has_neigh_prob:
            self.in_neigh_prob = [x[7] for x in instances]
            self.out_neigh_prob = [x[10] for x in instances]
        self.refs = [x[11] for x in instances]
        self.ids = [x[12] for x in instances]

        # making ndarray
        self.sentence_words = pad_2d(self.sentence_words)
        if options.with_char:
            self.sentence_chars = pad_3d(self.sentence_chars)
        if options.with_POS:
            self.sentence_POSs = pad_2d(self.sentence_POSs)
        self.nes = pad_2d(self.nes)
        self.entity_indices = pad_3d(self.entity_indices)
        self.in_neigh_indices = pad_3d(self.in_neigh_indices)
        self.in_neigh_edges = pad_3d(self.in_neigh_edges)
        self.out_neigh_indices = pad_3d(self.out_neigh_indices)
        self.out_neigh_edges = pad_3d(self.out_neigh_edges)
        if has_neigh_prob:
            self.in_neigh_prob = pad_3d(self.in_neigh_prob)
            self.out_neigh_prob = pad_3d(self.out_neigh_prob)
        self.refs = np.asarray(self.refs, dtype='int32')

        # indices and edge labels must line up with their masks
        assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
        assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
        assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
        assert self.out_neigh_mask.shape == self.out_neigh_edges.shape
    def __init__(self, instances, options, word_vocab=None, char_vocab=None,
                 POS_vocab=None, NER_vocab=None):
        """Build a padded sequence-to-sequence batch.

        Every instance is a 5-tuple whose first two fields are the source and
        target index sequences; the last three fields are exposed as
        ``source`` (field -3), ``target_ref`` (field -2) and ``id``
        (field -1).  Target sequences are padded to
        ``options.max_answer_len``.
        """
        self.options = options

        self.batch_size = len(instances)
        self.vocab = word_vocab

        self.id = [inst[-1] for inst in instances]
        self.source = [inst[-3] for inst in instances]
        self.target_ref = [inst[-2] for inst in instances]

        batch_len = len(instances)
        max_answer_len = options.max_answer_len

        # create length: both are [batch_size]
        src_lens = []
        tgt_lens = []
        for (sent1_idx, sent2_idx, _, _, _) in instances:
            src_lens.append(len(sent1_idx))
            # one extra position for the boundary symbol, capped
            tgt_lens.append(min(len(sent2_idx) + 1, max_answer_len))
        self.sent1_length = np.array(src_lens, dtype=np.int32)
        self.sent2_length = np.array(tgt_lens, dtype=np.int32)

        # create word representation
        start_id = word_vocab.getIndex('<s>')
        end_id = word_vocab.getIndex('</s>')
        if options.with_word:
            src_rows = []      # [batch_size, sent1_len]
            tgt_rows = []      # [batch_size, sent2_len]
            tgt_in_rows = []
            for (sent1_idx, sent2_idx, _, _, _) in instances:
                src_rows.append(sent1_idx)
                tgt_rows.append(sent2_idx + [end_id])
                tgt_in_rows.append([start_id] + sent2_idx)
            self.sent1_word = padding_utils.pad_2d_vals(
                src_rows, batch_len, np.max(self.sent1_length))
            self.sent2_word = padding_utils.pad_2d_vals(
                tgt_rows, batch_len, max_answer_len)
            self.sent2_input_word = padding_utils.pad_2d_vals(
                tgt_in_rows, batch_len, max_answer_len)

            self.in_answer_words = self.sent2_word
            self.gen_input_words = self.sent2_input_word
            self.answer_lengths = self.sent2_length

        if options.with_char:
            char_rows = []  # [batch_size, sent1_len]
            char_len_rows = []
            for (_, _, sent1, _, _) in instances:
                # only the first max_passage_len tokens get character indices
                tokens = sent1.split()[:options.max_passage_len]
                char_matrix = char_vocab.to_character_matrix_for_list(tokens)
                char_rows.append(char_matrix)
                char_len_rows.append([len(row) for row in char_matrix])
            self.sent1_char = padding_utils.pad_3d_vals_no_size(char_rows)
            self.sent1_char_lengths = padding_utils.pad_2d_vals_no_size(
                char_len_rows)
    def __init__(self, instances, options, word_vocab=None, char_vocab=None, POS_vocab=None, feat_vocab=None, action_vocab=None):
        """Build a padded batch of transition-action instances.

        Every instance is a 7-tuple ``(input_sent, concepts_idx, cid2wid,
        feats_idx, actions_idx, action2cid, action2wid)``.

        Action-aligned fields (``action_inp``, ``action_ref``, ``feats``,
        ``action2cid``, ``action2wid``) are padded to
        ``options.max_answer_len``.  Word, concept, lemma and POS sequences
        get a trailing ``'-NULL-'`` index and are padded to the batch maximum.

        Raises:
            AssertionError: if ``options.with_char`` is set; character
                features are disabled for this batch type.
        """
        self.options = options

        self.instances = instances
        self.batch_size = len(instances)
        self.action_vocab = action_vocab
        self.feat_vocab = feat_vocab  # Added the feature indexer as batch attributes.

        max_len = options.max_answer_len
        pad_2d = padding_utils.pad_2d_vals_no_size

        # create length: each is [batch_size]
        input_lens = []
        concept_lens = []
        action_lens = []
        for (input_sent, concepts_idx, _, _, actions_idx, _, _) in instances:
            # +1 accounts for the '-NULL-' item appended to both sequences below
            input_lens.append(input_sent.get_length() + 1)
            concept_lens.append(len(concepts_idx) + 1)
            action_lens.append(min(max_len, len(actions_idx)))
        self.input_length = np.array(input_lens, dtype=np.int32)
        self.concept_length = np.array(concept_lens, dtype=np.int32)
        self.action_length = np.array(action_lens, dtype=np.int32)

        start_id = action_vocab.getIndex('<s>')
        action_inp = []
        action_ref = []
        feats = []
        cid_rows = []
        wid_rows = []
        for (_, _, _, feats_idx, actions_idx, action2cid, action2wid) in instances:
            # decoder input: <s> followed by the actions minus the last one
            action_inp.append([start_id] + actions_idx[:-1])
            action_ref.append(actions_idx)
            feats.append(feats_idx)
            cid_rows.append(action2cid)
            wid_rows.append(action2wid)
        self.action_inp = padding_utils.pad_2d_vals(action_inp, len(action_inp), max_len)
        self.action_ref = padding_utils.pad_2d_vals(action_ref, len(action_ref), max_len)
        # the feature width is taken from the first action of the first instance
        self.feats = padding_utils.pad_3d_vals(feats, len(feats), max_len, len(feats[0][0]))
        self.action2cid = padding_utils.pad_2d_vals(cid_rows, len(cid_rows), max_len)
        self.action2wid = padding_utils.pad_2d_vals(wid_rows, len(wid_rows), max_len)

        append_id = word_vocab.getIndex('-NULL-')
        input_rows = []    # [batch_size, sent_len]
        concept_rows = []  # [batch_size, sent_len]
        for (input_sent, concepts_idx, _, _, _, _, _) in instances:
            input_rows.append(input_sent.word_idx_seq + [append_id])
            concept_rows.append(concepts_idx + [append_id])
        self.input_word = pad_2d(input_rows)
        self.concept_word = pad_2d(concept_rows)

        if options.with_lemma:
            # lemma sequences are terminated with the word vocab's '-NULL-' index
            self.input_lemma = pad_2d(
                [input_sent.lemma_idx_seq + [append_id]
                 for (input_sent, _, _, _, _, _, _) in instances])

        if options.with_char:
            # Character features are switched off for this batch type.  Raise
            # explicitly instead of `assert False`: asserts are stripped under
            # `python -O`, which would let the disabled code path run.
            raise AssertionError(
                'options.with_char is not supported by this batch type')

        if options.with_POS:
            append_pos_id = POS_vocab.getIndex('-NULL-')
            self.input_POS = pad_2d(
                [input_sent.POS_idx_seq + [append_pos_id]
                 for (input_sent, _, _, _, _, _, _) in instances])  # [batch_size, sent1_len]