Example #1
    def _arg_closest_related_questions(self, question, related_questions):
        all_question = [question] + related_questions
        # Unzip each example into its (char_ids, word_ids) components
        q_char_ids, q_word_ids = zip(*[zip(*zip(*x)) for x in all_question])

        padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                              pad_tok=0)
        padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                          pad_tok=0,
                                                          nlevels=2)

        feed_dict = {
            self.model.q_word_ids: padded_q_word_ids,
            self.model.q_char_ids: padded_q_char_ids,
            self.model.q_sequence_lengths: q_sequence_lengths,
            self.model.q_word_lengths: q_word_lengths,
            self.model.keep_op: 1.0,
            self.model.is_training: False
        }
        question_embeddings = self.model.sess.run(self.model.q_dense,
                                                  feed_dict=feed_dict)
        q = question_embeddings[0]  # embedding of the query question, shape (300,)
        rq = question_embeddings[1:]  # embeddings of the related questions
        # Squared Euclidean distance from the query to each related question
        scores = np.sum(np.square(rq - q), axis=-1)

        # Index of the closest (smallest-distance) related question
        q_min = scores.argsort()[0]
        return q_min
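
Every snippet on this page calls a project-local pad_sequences helper rather than keras.preprocessing.sequence.pad_sequences; even its keyword names drift between projects (pad_tok vs. pad_mark, max_sent_length vs. fix_length). Below is a minimal sketch of what such a helper plausibly does, reconstructed from the call sites alone; the exact signature and return types are assumptions, not the authors' code.

import numpy as np

# Hypothetical reconstruction inferred from the call sites on this page;
# not any of the authors' actual implementations.
def pad_sequences(sequences, pad_tok=0, nlevels=1, max_sent_length=None):
    if nlevels == 1:
        # Pad (or truncate) each sequence of ids to a common length
        max_len = max_sent_length or max(len(s) for s in sequences)
        lengths = [min(len(s), max_len) for s in sequences]
        padded = [list(s)[:max_len] + [pad_tok] * (max_len - len(s))
                  for s in sequences]
        return np.asarray(padded), lengths
    # nlevels == 2: each element is a sentence given as char-id lists per word
    max_word_len = max(len(w) for s in sequences for w in s)
    max_len = max_sent_length or max(len(s) for s in sequences)
    pad_word = [pad_tok] * max_word_len
    padded, word_lengths = [], []
    for s in sequences:
        words = [list(w) + [pad_tok] * (max_word_len - len(w)) for w in s][:max_len]
        lens = [len(w) for w in s][:max_len]
        padded.append(words + [pad_word] * (max_len - len(words)))
        word_lengths.append(lens + [0] * (max_len - len(lens)))
    return np.asarray(padded), word_lengths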
Example #2
    def _next_batch(self, data, num_batch):
        start = 0
        idx = 0
        while idx < num_batch:
            word_ids = data['words'][start:start + self.batch_size]
            pos_ids = data['poses'][start:start + self.batch_size]
            labels = data['labels'][start:start + self.batch_size]
            relation_ids = data['relations'][start:start + self.batch_size]

            relation_ids, sequence_lengths_re = pad_sequences(relation_ids, pad_tok=0, max_sent_length=self.max_length)

            # Word - relation - word: interleave word ids with relation ids
            word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=0, max_sent_length=self.max_length)
            # Offset relation ids by the word-vocabulary size so that words and
            # relations can index one concatenated embedding matrix
            relation_wd_ids = self.embeddings.shape[0] + relation_ids
            word_relation_ids = np.zeros((word_ids.shape[0], word_ids.shape[1] + relation_wd_ids.shape[1]))
            word_relation_ids[:, ::2] = word_ids
            word_relation_ids[:, 1::2] = relation_wd_ids

            # Pos - relation - pos
            pos_ids, sequence_lengths = pad_sequences(pos_ids, pad_tok=0, max_sent_length=self.max_length)
            relation_pos_ids = self.num_of_pos + 1 + relation_ids
            pos_relation_ids = np.zeros((pos_ids.shape[0], pos_ids.shape[1] + relation_pos_ids.shape[1]))
            pos_relation_ids[:, ::2] = pos_ids
            pos_relation_ids[:, 1::2] = relation_pos_ids

            start += self.batch_size
            idx += 1
            yield word_relation_ids, pos_relation_ids, labels, relation_ids, sequence_lengths, sequence_lengths_re
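
The pair of strided-slice assignments builds an interleaved row w1 r1 w2 r2 ..., presumably so that downstream layers see each word next to the dependency relation that follows it. A standalone toy demonstration of the trick (illustrative values only):

import numpy as np

# Toy demonstration of the strided-slice interleaving used above.
words = np.array([[10, 11, 12],
                  [20, 21, 22]])
rels = np.array([[100, 101, 102],
                 [200, 201, 202]])

merged = np.zeros((2, 6), dtype=words.dtype)
merged[:, ::2] = words   # even columns: word ids
merged[:, 1::2] = rels   # odd columns: (offset) relation ids
print(merged)
# [[ 10 100  11 101  12 102]
#  [ 20 200  21 201  22 202]]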
Example #3
    def _next_batch(self, data, num_batch):
        start = 0
        idx = 0
        while idx < num_batch:
            w_batch = data['words'][start:start + self.batch_size]
            l_batch = data['labels'][start:start + self.batch_size]

            word_ids, sequence_lengths = pad_sequences(w_batch, pad_tok=0)
            labels, _ = pad_sequences(l_batch, pad_tok=0)

            start += self.batch_size
            idx += 1
            yield (word_ids, labels, sequence_lengths)
Example #4
    def get_feed_dict(self, seqs, labels=None, lr=None, dropout=None):
        word_ids, seq_len_list = pad_sequences(seqs, pad_mark=0)
        feed_dict = {
            self.word_ids: word_ids,
            self.sequence_lengths: seq_len_list
        }
        if labels is not None:
            labels_, _ = pad_sequences(labels, pad_mark=0)
            feed_dict[self.labels] = labels_
        if lr is not None:
            feed_dict[self.lr_pl] = lr
        if dropout is not None:
            feed_dict[self.dropout_pl] = dropout
        return feed_dict, seq_len_list
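
A typical call site for this helper in a TF1 training loop might look like the sketch below; model, train_op, and loss are illustrative names, not taken from the source above.

# Hypothetical usage; model, train_op and loss are illustrative names.
feed_dict, seq_len_list = model.get_feed_dict(seqs, labels,
                                              lr=0.001, dropout=0.5)
_, batch_loss = model.sess.run([model.train_op, model.loss],
                               feed_dict=feed_dict)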
Example #5
    def get_scores(self, question, contexts):
        scores = []

        num_batch = len(contexts) // self.batch_size + 1
        start = 0
        idx = 0
        while idx < num_batch:
            context_batch = contexts[start:start + self.batch_size]
            if not context_batch:
                break

            q_char_ids, q_word_ids = zip(
                *[zip(*zip(*x)) for x in [question] * len(context_batch)])
            padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                                  pad_tok=0)
            padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                              pad_tok=0,
                                                              nlevels=2)

            c_char_ids, c_word_ids = zip(
                *[zip(*zip(*x)) for x in context_batch])
            padded_c_word_ids, c_sequence_lengths = pad_sequences(c_word_ids,
                                                                  pad_tok=0)
            padded_c_char_ids, c_word_lengths = pad_sequences(c_char_ids,
                                                              pad_tok=0,
                                                              nlevels=2)

            start += self.batch_size
            idx += 1

            feed_dict = {
                self.cp_char_ids: padded_c_char_ids,
                self.cp_word_ids: padded_c_word_ids,
                self.cp_sequence_lengths: c_sequence_lengths,
                self.cp_word_lengths: c_word_lengths,
                self.q_word_ids: padded_q_word_ids,
                self.q_char_ids: padded_q_char_ids,
                self.q_sequence_lengths: q_sequence_lengths,
                self.q_word_lengths: q_word_lengths,
                self.keep_op: 1.0,
                self.is_training: False
            }
            scores.extend(self.sess.run(self.p_score, feed_dict=feed_dict))
        return np.array(scores)
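
Note that len(contexts) // self.batch_size + 1 over-counts by one whenever the length is an exact multiple of the batch size; the `if not context_batch: break` guard absorbs the resulting empty batch. A ceiling division (a minor variant, not the authors' code) avoids the sentinel entirely:

import math

num_batch = math.ceil(len(contexts) / batch_size)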
Example #6
    def _next_batch(self, data, num_batch):
        start = 0
        idx = 0

        while idx < num_batch:
            example_batch = data[start:start + self.batch_size]
            if not example_batch:
                break

            q_batch, cp_batch, cn_batch = zip(*example_batch)

            q_char_ids, q_word_ids = zip(*[zip(*zip(*x)) for x in q_batch])
            padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                                  pad_tok=0)
            padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                              pad_tok=0,
                                                              nlevels=2)

            cp_char_ids, cp_word_ids = zip(*[zip(*zip(*x)) for x in cp_batch])
            padded_cp_word_ids, cp_sequence_lengths = pad_sequences(
                cp_word_ids, pad_tok=0)
            padded_cp_char_ids, cp_word_lengths = pad_sequences(cp_char_ids,
                                                                pad_tok=0,
                                                                nlevels=2)

            cn_char_ids, cn_word_ids = zip(*[zip(*zip(*x)) for x in cn_batch])
            padded_cn_word_ids, cn_sequence_lengths = pad_sequences(
                cn_word_ids, pad_tok=0)
            padded_cn_char_ids, cn_word_lengths = pad_sequences(cn_char_ids,
                                                                pad_tok=0,
                                                                nlevels=2)

            start += self.batch_size
            idx += 1
            yield (padded_q_word_ids, padded_q_char_ids, q_sequence_lengths,
                   q_word_lengths, padded_cp_word_ids, padded_cp_char_ids,
                   cp_sequence_lengths, cp_word_lengths, padded_cn_word_ids,
                   padded_cn_char_ids, cn_sequence_lengths, cn_word_lengths)
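
One plausible way to consume these (question, positive context, negative context) batches in a training step, by analogy with the placeholder names in Example #5; the cn_* placeholders, the keep_op value, and train_op/loss are all assumptions:

# Hypothetical training step; cn_* placeholder names, train_op and loss
# are assumed by analogy with Example #5, not taken from this source.
for batch in self._next_batch(train_data, num_batch):
    (q_w, q_c, q_lens, q_wlens,
     cp_w, cp_c, cp_lens, cp_wlens,
     cn_w, cn_c, cn_lens, cn_wlens) = batch
    feed_dict = {
        self.q_word_ids: q_w, self.q_char_ids: q_c,
        self.q_sequence_lengths: q_lens, self.q_word_lengths: q_wlens,
        self.cp_word_ids: cp_w, self.cp_char_ids: cp_c,
        self.cp_sequence_lengths: cp_lens, self.cp_word_lengths: cp_wlens,
        self.cn_word_ids: cn_w, self.cn_char_ids: cn_c,
        self.cn_sequence_lengths: cn_lens, self.cn_word_lengths: cn_wlens,
        self.keep_op: 0.5, self.is_training: True,
    }
    _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)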
Example #7
    def _get_word_ids(self, X):
        word_ids = [x[1] for x in X]
        word_ids, _ = pad_sequences(word_ids, pad_tok=0, fix_length=self.max_input_word)
        return word_ids
Example #8
    def _next_batch(self, data, num_batch):
        start = 0
        idx = 0
        while idx < num_batch:
            # Get BATCH_SIZE samples each batch
            word_ids = data['words'][start:start + self.batch_size]
            sibling_ids = data['siblings'][start:start + self.batch_size]
            positions_1 = data['positions_1'][start:start + self.batch_size]
            positions_2 = data['positions_2'][start:start + self.batch_size]
            pos_ids = data['poses'][start:start + self.batch_size]
            synset_ids = data['synsets'][start:start + self.batch_size]
            relation_ids = data['relations'][start:start + self.batch_size]
            directions = data['directions'][start:start + self.batch_size]
            labels = data['labels'][start:start + self.batch_size]

            # Padding sentences to the length of longest one
            word_ids, _ = pad_sequences(word_ids,
                                        pad_tok=0,
                                        max_sent_length=self.max_length)
            sibling_ids, _ = pad_sequences(sibling_ids,
                                           pad_tok=0,
                                           max_sent_length=self.max_length)
            positions_1, _ = pad_sequences(positions_1,
                                           pad_tok=0,
                                           max_sent_length=self.max_length)
            positions_2, _ = pad_sequences(positions_2,
                                           pad_tok=0,
                                           max_sent_length=self.max_length)
            pos_ids, _ = pad_sequences(pos_ids,
                                       pad_tok=0,
                                       max_sent_length=self.max_length)
            synset_ids, _ = pad_sequences(synset_ids,
                                          pad_tok=0,
                                          max_sent_length=self.max_length)
            relation_ids, _ = pad_sequences(relation_ids,
                                            pad_tok=0,
                                            max_sent_length=self.max_length)
            directions, _ = pad_sequences(directions,
                                          pad_tok=0,
                                          max_sent_length=self.max_length)

            # print("words: ", word_ids.shape)
            # print("siblings: ", sibling_ids.shape)

            # Create index matrix with words and dependency relations between words
            new_relation_ids = self.embeddings.shape[0] + relation_ids + directions
            word_relation_ids = np.zeros(
                (word_ids.shape[0],
                 word_ids.shape[1] + new_relation_ids.shape[1]))
            w_ids, rel_idxs = [], []
            for j in range(word_ids.shape[1] + new_relation_ids.shape[1]):
                if j % 2 == 0:
                    w_ids.append(j)
                else:
                    rel_idxs.append(j)
            word_relation_ids[:, w_ids] = word_ids
            word_relation_ids[:, rel_idxs] = new_relation_ids

            # Create index matrix with sibling ids and dependency relations between siblings
            new_relation_ids = self.num_of_siblings + 1 + relation_ids + directions
            sibling_relation_ids = np.zeros(
                (sibling_ids.shape[0],
                 sibling_ids.shape[1] + new_relation_ids.shape[1]))
            sibling_relation_ids[:, w_ids] = sibling_ids
            sibling_relation_ids[:, rel_idxs] = new_relation_ids

            # Create index matrix with pos tags and dependency relations between pos tags
            new_relation_ids = self.num_of_pos + 1 + relation_ids + directions
            pos_relation_ids = np.zeros(
                (pos_ids.shape[0],
                 pos_ids.shape[1] + new_relation_ids.shape[1]))
            pos_relation_ids[:, w_ids] = pos_ids
            pos_relation_ids[:, rel_idxs] = new_relation_ids

            # Create index matrix with synsets and dependency relations between synsets
            new_relation_ids = self.num_of_synset + 1 + relation_ids + directions
            synset_relation_ids = np.zeros(
                (synset_ids.shape[0],
                 synset_ids.shape[1] + new_relation_ids.shape[1]))
            synset_relation_ids[:, w_ids] = synset_ids
            synset_relation_ids[:, rel_idxs] = new_relation_ids

            # Create index matrix with positions and dependency relations between positions
            new_relation_ids = self.max_length + 1 + relation_ids + directions
            positions_1_relation_ids = np.zeros(
                (positions_1.shape[0],
                 positions_1.shape[1] + new_relation_ids.shape[1]))
            positions_1_relation_ids[:, w_ids] = positions_1
            positions_1_relation_ids[:, rel_idxs] = new_relation_ids

            # Same construction for the second position feature (reuses new_relation_ids)
            positions_2_relation_ids = np.zeros(
                (positions_2.shape[0],
                 positions_2.shape[1] + new_relation_ids.shape[1]))
            positions_2_relation_ids[:, w_ids] = positions_2
            positions_2_relation_ids[:, rel_idxs] = new_relation_ids

            start += self.batch_size
            idx += 1
            yield positions_1_relation_ids, positions_2_relation_ids, \
                  word_relation_ids, sibling_relation_ids, pos_relation_ids, synset_relation_ids, relation_ids, labels
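
Incidentally, the w_ids/rel_idxs index lists built in the loop above are exactly the even and odd column indices, so each pair of assignments is equivalent to the strided slices arr[:, ::2] = ... and arr[:, 1::2] = ... used in Example #2.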