Example #1
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_subjs = tf.get_variable("E_s", [len(self._kb.get_symbols(1)), self._size])
            E_objs = tf.get_variable("E_o", [len(self._kb.get_symbols(2)), self._size])
            E_rels_s = tf.get_variable("E_r_s", [len(self._kb.get_symbols(0)), self._size])
            E_rels_o = tf.get_variable("E_r_o", [len(self._kb.get_symbols(0)), self._size])

        self.e_subj = tf.tanh(tf.nn.embedding_lookup(E_subjs, self._subj_input))
        self.e_obj = tf.tanh(tf.nn.embedding_lookup(E_objs, self._obj_input))
        self.e_rel_s = tf.tanh(tf.nn.embedding_lookup(E_rels_s, self._rel_input))
        self.e_rel_o = tf.tanh(tf.nn.embedding_lookup(E_rels_o, self._rel_input))

        score = tf_util.batch_dot(self.e_rel_s, self.e_subj) + tf_util.batch_dot(self.e_rel_o, self.e_obj)

        return score
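A note on the helper used throughout these examples: tf_util.batch_dot appears to come from the project's own tf_util module and is not shown on this page. A minimal sketch of what it presumably computes (a row-wise dot product over [batch, size] tensors), written in the same TF 1.x style as the examples; the real implementation may differ:

import tensorflow as tf

def batch_dot(a, b):
    # element-wise product followed by a sum over the embedding dimension,
    # returning one scalar score per row of the batch
    return tf.reduce_sum(a * b, reduction_indices=[1], keep_dims=False)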
Example #2
File: models.py Project: dokal01/genie-kb
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_rels = tf.get_variable(
                "E_r", [len(self._kb.get_symbols(0)), self._size])
            E_tup_rels = tf.get_variable(
                "E_tup_r", [2 * self._num_relations + 1, self._size])  # rels + inv rels + default rel

        blur_factor = tf.get_variable("blur",
                                      shape=[1],
                                      initializer=tf.constant_initializer(0.0))
        blur_factor = tf.sigmoid(blur_factor)
        # duplicate rels to fit with observations
        e_rel = tf.gather(
            tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input)),
            self._gather_rels_input)
        e_tup_rels = tf.tanh(
            tf.nn.embedding_lookup(E_tup_rels, self._sparse_values_input))

        scores_flat = tf_util.batch_dot(e_rel, e_tup_rels)
        # for softmax set empty cells to something very small, so weight becomes practically zero
        scores = tf.sparse_to_dense(self._sparse_indices_input,
                                    self._shape_input,
                                    scores_flat,
                                    default_value=-1e-3)
        softmax = tf.nn.softmax(scores * blur_factor)
        weighted_scores = tf.reduce_sum(scores * softmax,
                                        reduction_indices=[1],
                                        keep_dims=False)

        return weighted_scores
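The pooling in this variant scatters the flat per-cell scores into a dense 2-D matrix (roughly one row per scored fact, one column per observed tuple relation) and then pools each row with its own softmax weights, additionally scaled here by the learned blur factor. A toy sketch of that mechanism, not project code and ignoring the blur factor for simplicity:

import tensorflow as tf

sparse_indices = tf.constant([[0, 0], [0, 1], [1, 0]], tf.int64)  # (row, slot) positions of observed cells
scores_flat = tf.constant([2.0, 0.5, 1.0])                        # one score per observed cell
shape = tf.constant([2, 2], tf.int64)
scores = tf.sparse_to_dense(sparse_indices, shape, scores_flat, default_value=-1e-3)
weighted_scores = tf.reduce_sum(scores * tf.nn.softmax(scores), reduction_indices=[1])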
Example #3
    def _scoring_f(self):
        with tf.device("/cpu:0"):
           E_rels = tf.get_variable("E_r", [len(self._kb.get_symbols(0)), self._size])
           E_tups = tf.get_variable("E_t", [len(self.__tuple_lookup), self._size])

        self.e_rel = tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input))
        self.e_tup = tf.tanh(tf.nn.embedding_lookup(E_tups, self._tuple_input))

        return tf_util.batch_dot(self.e_rel, self.e_tup)
Example #4
File: models.py Project: dokal01/genie-kb
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_rels = tf.get_variable(
                "E_r", [len(self._kb.get_symbols(0)), self._size])
            E_tups = tf.get_variable("E_t",
                                     [len(self.__tuple_lookup), self._size])

        self.e_rel = tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input))
        self.e_tup = tf.tanh(tf.nn.embedding_lookup(E_tups, self._tuple_input))

        return tf_util.batch_dot(self.e_rel, self.e_tup)
Example #5
    def _scoring_f(self):
        with tf.device("/cpu:0"):
           E_rels = tf.get_variable("E_r", [len(self._kb.get_symbols(0)), self._size])
           E_tup_rels = tf.get_variable("E_tup_r", [2 * self._num_relations + 1, self._size])  # rels + inv rels + default rel

        self.e_rel = tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input))
        # weighted sum of tuple rel embeddings
        sparse_tensor = tf.SparseTensor(self._sparse_indices_input, self._sparse_values_input, self._shape_input)
        # mean embedding
        self.e_tuple_rels = tf.tanh(tf.nn.embedding_lookup_sparse(E_tup_rels, sparse_tensor, None))

        return tf_util.batch_dot(self.e_rel, self.e_tuple_rels)
Example #6
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_subjs = tf.get_variable("E_s", [len(self._kb.get_symbols(1)), self._size])
            E_objs = tf.get_variable("E_o", [len(self._kb.get_symbols(2)), self._size])
            E_rels = tf.get_variable("E_r", [len(self._kb.get_symbols(0)), self._size])

        self.e_subj = tf.tanh(tf.nn.embedding_lookup(E_subjs, self._subj_input))
        self.e_obj = tf.tanh(tf.nn.embedding_lookup(E_objs, self._obj_input))
        self.e_rel = tf.sigmoid(tf.nn.embedding_lookup(E_rels, self._rel_input))
        s_o_prod = self.e_obj * self.e_subj

        score = tf_util.batch_dot(self.e_rel, s_o_prod)

        return score
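For reference, the score in this variant resembles a DistMult-style trilinear product with a sigmoid on the relation factor. A small NumPy sketch (not project code) of the same per-example computation, assuming batch_dot is a row-wise dot product as sketched after Example #1:

import numpy as np

batch, size = 4, 8
e_subj = np.tanh(np.random.randn(batch, size))
e_obj = np.tanh(np.random.randn(batch, size))
e_rel = 1.0 / (1.0 + np.exp(-np.random.randn(batch, size)))   # sigmoid
score = np.sum(e_rel * e_subj * e_obj, axis=1)                # score[i] = sum_k r[i,k] * s[i,k] * o[i,k]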
Example #7
File: models.py Project: dokal01/genie-kb
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_subjs = tf.get_variable(
                "E_s", [len(self._kb.get_symbols(1)), self._size])
            E_objs = tf.get_variable(
                "E_o", [len(self._kb.get_symbols(2)), self._size])
            E_rels_s = tf.get_variable(
                "E_r_s", [len(self._kb.get_symbols(0)), self._size])
            E_rels_o = tf.get_variable(
                "E_r_o", [len(self._kb.get_symbols(0)), self._size])

        self.e_subj = tf.tanh(tf.nn.embedding_lookup(E_subjs,
                                                     self._subj_input))
        self.e_obj = tf.tanh(tf.nn.embedding_lookup(E_objs, self._obj_input))
        self.e_rel_s = tf.tanh(
            tf.nn.embedding_lookup(E_rels_s, self._rel_input))
        self.e_rel_o = tf.tanh(
            tf.nn.embedding_lookup(E_rels_o, self._rel_input))

        score = tf_util.batch_dot(self.e_rel_s,
                                  self.e_subj) + tf_util.batch_dot(
                                      self.e_rel_o, self.e_obj)

        return score
Example #8
    def _scoring_f(self):
        with tf.device("/cpu:0"):
           E_rels = tf.get_variable("E_r", [len(self._kb.get_symbols(0)), self._size])
           E_tup_rels = tf.get_variable("E_tup_r", [2 * self._num_relations + 1, self._size])  # rels + inv rels + default rel

        # duplicate rels to fit with observations
        e_rel = tf.gather(tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input)), self._gather_rels_input)
        e_tup_rels = tf.tanh(tf.nn.embedding_lookup(E_tup_rels, self._sparse_values_input))

        scores_flat = tf_util.batch_dot(e_rel, e_tup_rels)
        # for softmax set empty cells to something very small, so weight becomes practically zero
        scores = tf.sparse_to_dense(self._sparse_indices_input, self._shape_input,
                                    scores_flat, default_value=-1e-3)
        softmax = tf.nn.softmax(scores)
        weighted_scores = tf.reduce_sum(scores * softmax, reduction_indices=[1], keep_dims=False)

        return weighted_scores
Example #9
File: models.py Project: dokal01/genie-kb
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_rels = tf.get_variable(
                "E_r", [len(self._kb.get_symbols(0)), self._size])
            E_tup_rels = tf.get_variable(
                "E_tup_r", [2 * self._num_relations + 1, self._size])  # rels + inv rels + default rel

        self.e_rel = tf.tanh(tf.nn.embedding_lookup(E_rels, self._rel_input))
        # weighted sum of tuple rel embeddings
        sparse_tensor = tf.SparseTensor(self._sparse_indices_input,
                                        self._sparse_values_input,
                                        self._shape_input)
        # mean embedding
        self.e_tuple_rels = tf.tanh(
            tf.nn.embedding_lookup_sparse(E_tup_rels, sparse_tensor, None))

        return tf_util.batch_dot(self.e_rel, self.e_tuple_rels)
Example #10
File: models.py Project: dokal01/genie-kb
    def _scoring_f(self):
        with tf.device("/cpu:0"):
            E_subjs = tf.get_variable(
                "E_s", [len(self._kb.get_symbols(1)), self._size])
            E_objs = tf.get_variable(
                "E_o", [len(self._kb.get_symbols(2)), self._size])
            E_rels = tf.get_variable(
                "E_r", [len(self._kb.get_symbols(0)), self._size])

        self.e_subj = tf.tanh(tf.nn.embedding_lookup(E_subjs,
                                                     self._subj_input))
        self.e_obj = tf.tanh(tf.nn.embedding_lookup(E_objs, self._obj_input))
        self.e_rel = tf.sigmoid(tf.nn.embedding_lookup(E_rels,
                                                       self._rel_input))
        s_o_prod = self.e_obj * self.e_subj

        score = tf_util.batch_dot(self.e_rel, s_o_prod)

        return score
Example #11
    def _retrieve_answer(self, query):
        """
        Retrieves the answer for the specified query, implementing consecutive updates to the query and the answer.
        :return: the answer; if num_hops is 0, the query itself is returned
        """
        query, supp_queries = tf.dynamic_partition(query,
                                                   self._query_partition, 2)
        with tf.variable_scope("support"):
            num_queries = tf.shape(query)[0]

            with tf.device("/cpu:0"):
                _, supp_answer_output_ids = tf.dynamic_partition(
                    self._answer_input, self._query_partition, 2)
                _, supp_answer_input_ids = tf.dynamic_partition(
                    self._answer_word_input, self._query_partition, 2)
                supp_answers = tf.nn.embedding_lookup(self.output_embedding,
                                                      supp_answer_output_ids)
                aligned_supp_answers = tf.gather(
                    supp_answers,
                    self._support_ids)  # and with respective answers

                if self._max_hops > 1:
                    # used in multihop
                    answer_words = tf.nn.embedding_lookup(
                        self.input_embedding, supp_answer_input_ids)
                    aligned_answers_input = tf.gather(answer_words,
                                                      self._support_ids)

            self.support_scores = []
            query_as_answer = tf.contrib.layers.fully_connected(
                query,
                self._size,
                activation_fn=None,
                weights_initializer=None,
                biases_initializer=None,
                scope="query_to_answer")
            query_as_answer = query_as_answer * tf.sigmoid(
                tf.get_variable("query_as_answer_gate",
                                tuple(),
                                initializer=tf.constant_initializer(0.0)))
            current_answer = query_as_answer
            current_query = query

            aligned_support = tf.gather(
                supp_queries,
                self._support_ids)  # align supp_queries with queries
            collab_support = tf.gather(
                query,
                self._collab_support_ids)  # other queries used as collaborative support
            aligned_support = tf.concat(0, [aligned_support, collab_support])

            query_ids = tf.concat(0, [self._query_ids, self._collab_query_ids])
            self.answer_weights = []

            for i in range(self._max_hops):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                collab_queries = tf.gather(
                    current_query,
                    self._collab_query_ids)  # align queries for collaborative support
                aligned_queries = tf.gather(current_query,
                                            self._query_ids)  # align queries
                aligned_queries = tf.concat(0,
                                            [aligned_queries, collab_queries])

                with tf.variable_scope("support_scores"):
                    scores = tf_util.batch_dot(aligned_queries,
                                               aligned_support)
                    self.support_scores.append(scores)
                    score_max = tf.gather(tf.segment_max(scores, query_ids),
                                          query_ids)
                    e_scores = tf.exp(scores - score_max)
                    norm = tf.unsorted_segment_sum(
                        e_scores, query_ids,
                        num_queries) + 0.00001  # for zero norms
                    norm = tf.expand_dims(norm, 1)
                    e_scores = tf.expand_dims(e_scores, 1)

                with tf.variable_scope("support_answers"):
                    aligned_supp_answers_with_collab = tf.concat(
                        0, [aligned_supp_answers, collab_queries])
                    weighted_supp_answers = tf.unsorted_segment_sum(
                        e_scores * aligned_supp_answers_with_collab, query_ids,
                        num_queries) / norm

                with tf.variable_scope("support_queries"):
                    weighted_supp_queries = tf.unsorted_segment_sum(
                        e_scores * aligned_support, query_ids,
                        num_queries) / norm

                with tf.variable_scope("answer_accumulation"):
                    answer_p_max = tf.reduce_max(tf.nn.softmax(
                        self._score_candidates(weighted_supp_answers)), [1],
                                                 keep_dims=True)
                    answer_weight = tf.contrib.layers.fully_connected(
                        tf.concat(1, [
                            query_as_answer * weighted_supp_answers,
                            weighted_supp_queries * current_query, answer_p_max
                        ]),
                        1,
                        activation_fn=tf.nn.sigmoid,
                        weights_initializer=tf.constant_initializer(0.0),
                        biases_initializer=tf.constant_initializer(0.0),
                        scope="answer_weight")

                    new_answer = answer_weight * weighted_supp_answers + current_answer

                    # this condition allows for setting varying number of hops
                    current_answer = tf.cond(tf.greater(self.num_hops,
                                                        i), lambda: new_answer,
                                             lambda: current_answer)

                    self.answer_weights.append(answer_weight)

                if i < self._max_hops - 1:
                    with tf.variable_scope("query_update"):
                        # prepare subsequent query
                        aligned_answers_input_with_collab = tf.concat(
                            0, [aligned_answers_input, collab_queries])
                        weighted_answer_words = tf.unsorted_segment_sum(
                            e_scores * aligned_answers_input_with_collab,
                            query_ids, num_queries) / norm

                        c = tf.contrib.layers.fully_connected(
                            tf.concat(1, [
                                current_query, weighted_supp_queries,
                                weighted_answer_words
                            ]),
                            self._size,
                            activation_fn=tf.tanh,
                            scope="update_candidate",
                            weights_initializer=None,
                            biases_initializer=None)

                        gate = tf.contrib.layers.fully_connected(
                            tf.concat(1,
                                      [current_query, weighted_supp_queries]),
                            self._size,
                            activation_fn=tf.sigmoid,
                            weights_initializer=None,
                            scope="update_gate",
                            biases_initializer=tf.constant_initializer(1))
                        current_query = gate * current_query + (1 - gate) * c

            return current_answer
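The "support_scores" block above implements a softmax over a variable number of support rows per query via segment operations: scores that share the same query_id are normalized together. A toy sketch of that pattern (not project code, TF 1.x API):

import tensorflow as tf

scores = tf.constant([2.0, 1.0, 0.5, 3.0])   # flat support scores
query_ids = tf.constant([0, 0, 0, 1])        # which query each support row belongs to
num_queries = 2
score_max = tf.gather(tf.segment_max(scores, query_ids), query_ids)
e_scores = tf.exp(scores - score_max)        # subtract the per-query max for numerical stability
norm = tf.unsorted_segment_sum(e_scores, query_ids, num_queries) + 1e-5  # guard against zero norms
attention = e_scores / tf.gather(norm, query_ids)  # softmax weights within each query's support set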
Example #12
    def __init__(self,
                 size,
                 batch_size,
                 vocab_size,
                 answer_vocab_size,
                 max_length,
                 is_train=True,
                 learning_rate=1e-2,
                 composition="GRU",
                 max_hops=0,
                 devices=None,
                 keep_prob=1.0):
        """
        :param size: size of hidden states
        :param batch_size: initial batch_size (adapts automatically)
        :param vocab_size: size of input vocabulary (vocabulary of contexts)
        :param answer_vocab_size: size of answer (candidates) vocabulary
        :param max_length: maximum length of an individual context
        :param is_train:
        :param learning_rate:
        :param composition: "GRU", "LSTM", "BiGRU" are possible
        :param max_hops: maximum number of hops; can be lowered at run time by assigning a smaller value to the
        variable (self.)num_hops, which is initialized with max_hops
        :param devices: defaults to ["/cpu:0"], but can be a list of up to 3 devices; the model is automatically
        partitioned across the given devices
        :param keep_prob: 1.0 - dropout rate, applied to the input embeddings
        """
        self._vocab_size = vocab_size
        self._max_length = max_length
        self._size = size
        self._batch_size = batch_size
        self._is_train = is_train
        self._composition = composition
        self._max_hops = max_hops
        self._device0 = devices[0] if devices is not None else "/cpu:0"
        self._device1 = devices[
            1 % len(devices)] if devices is not None else "/cpu:0"
        self._device2 = devices[
            2 % len(devices)] if devices is not None else "/cpu:0"

        self._init = tf.random_normal_initializer(0.0, 0.1)
        with tf.device(self._device0):
            with tf.variable_scope(
                    self.name(),
                    initializer=tf.contrib.layers.xavier_initializer()):
                self._init_inputs()
                self.keep_prob = tf.get_variable(
                    "keep_prob", [],
                    initializer=tf.constant_initializer(keep_prob))
                with tf.device("/cpu:0"):
                    # embeddings
                    self.output_embedding = tf.get_variable(
                        "E_candidate", [answer_vocab_size, self._size],
                        initializer=self._init)
                    self.input_embedding = tf.get_variable(
                        "E_words", [vocab_size, self._size],
                        initializer=self._init)
                    answer, _ = tf.dynamic_partition(self._answer_input,
                                                     self._query_partition, 2)
                    lookup_individual = tf.nn.embedding_lookup(
                        self.output_embedding, answer)
                    cands, _ = tf.dynamic_partition(self._answer_candidates,
                                                    self._query_partition, 2)
                    self.candidate_lookup = tf.nn.embedding_lookup(
                        self.output_embedding, cands)

                self.num_hops = tf.Variable(self._max_hops,
                                            trainable=False,
                                            name="num_queries")
                self.query = self._comp_f()
                answer = self._retrieve_answer(self.query)
                self.score = tf_util.batch_dot(lookup_individual, answer)
                self.scores_with_negs = self._score_candidates(answer)

                if is_train:
                    self.learning_rate = tf.Variable(float(learning_rate),
                                                     trainable=False,
                                                     name="lr")
                    self.global_step = tf.Variable(0,
                                                   trainable=False,
                                                   name="step")

                    self.opt = tf.train.AdamOptimizer(self.learning_rate)

                    current_batch_size = tf.gather(
                        tf.shape(self.scores_with_negs), [0])

                    loss = math_ops.reduce_sum(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            self.scores_with_negs,
                            tf.tile(tf.constant([0], tf.int64),
                                    current_batch_size)))

                    train_params = tf.trainable_variables()
                    self.training_weight = tf.Variable(1.0,
                                                       trainable=False,
                                                       name="training_weight")

                    self._loss = loss / math_ops.cast(current_batch_size,
                                                      tf.float32)
                    self._grads = tf.gradients(
                        self._loss,
                        train_params,
                        self.training_weight,
                        colocate_gradients_with_ops=True)

                    if len(train_params) > 0:
                        grads, _ = tf.clip_by_global_norm(self._grads, 5.0)
                        self._update = self.opt.apply_gradients(
                            zip(grads, train_params),
                            global_step=self.global_step)
                    else:
                        self._update = tf.assign_add(self.global_step, 1)
        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)
Example #13
    def _retrieve_answer(self, query):
        """
        Retrieves the answer for the specified query, implementing consecutive updates to the query and the answer.
        :return: the answer; if num_hops is 0, the query itself is returned
        """
        query, supp_queries = tf.dynamic_partition(query, self._query_partition, 2)
        with tf.variable_scope("support"):
            num_queries = tf.shape(query)[0]

            with tf.device("/cpu:0"):
                _, supp_answer_output_ids = tf.dynamic_partition(self._answer_input, self._query_partition, 2)
                _, supp_answer_input_ids = tf.dynamic_partition(self._answer_word_input, self._query_partition, 2)
                supp_answers = tf.nn.embedding_lookup(self.output_embedding, supp_answer_output_ids)
                aligned_supp_answers = tf.gather(supp_answers, self._support_ids)  # and with respective answers

                if self._max_hops > 1:
                    # used in multihop
                    answer_words = tf.nn.embedding_lookup(self.input_embedding, supp_answer_input_ids)
                    aligned_answers_input = tf.gather(answer_words, self._support_ids)

            self.support_scores = []
            query_as_answer = tf.contrib.layers.fully_connected(query, self._size,
                                                                activation_fn=None, weights_initializer=None,
                                                                biases_initializer=None, scope="query_to_answer")
            query_as_answer = query_as_answer * tf.sigmoid(tf.get_variable("query_as_answer_gate", tuple(),
                                                                           initializer=tf.constant_initializer(0.0)))
            current_answer = query_as_answer
            current_query = query

            aligned_support = tf.gather(supp_queries, self._support_ids)  # align supp_queries with queries
            collab_support = tf.gather(query, self._collab_support_ids)  # other queries used as collaborative support
            aligned_support = tf.concat(0, [aligned_support, collab_support])

            query_ids = tf.concat(0, [self._query_ids, self._collab_query_ids])
            self.answer_weights = []


            for i in range(self._max_hops):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                collab_queries = tf.gather(current_query, self._collab_query_ids)  # align queries for collaborative support
                aligned_queries = tf.gather(current_query, self._query_ids)  # align queries
                aligned_queries = tf.concat(0, [aligned_queries, collab_queries])

                with tf.variable_scope("support_scores"):
                    scores = tf_util.batch_dot(aligned_queries, aligned_support)
                    self.support_scores.append(scores)
                    score_max = tf.gather(tf.segment_max(scores, query_ids), query_ids)
                    e_scores = tf.exp(scores - score_max)
                    norm = tf.unsorted_segment_sum(e_scores, query_ids, num_queries) + 0.00001 # for zero norms
                    norm = tf.expand_dims(norm, 1)
                    e_scores = tf.expand_dims(e_scores, 1)

                with tf.variable_scope("support_answers"):
                    aligned_supp_answers_with_collab = tf.concat(0, [aligned_supp_answers, collab_queries])
                    weighted_supp_answers = tf.unsorted_segment_sum(e_scores * aligned_supp_answers_with_collab,
                                                               query_ids, num_queries) / norm

                with tf.variable_scope("support_queries"):
                    weighted_supp_queries = tf.unsorted_segment_sum(e_scores * aligned_support, query_ids, num_queries) / norm
                
                with tf.variable_scope("answer_accumulation"):
                    answer_p_max = tf.reduce_max(tf.nn.softmax(self._score_candidates(weighted_supp_answers)), [1], keep_dims=True)
                    answer_weight = tf.contrib.layers.fully_connected(tf.concat(1, [query_as_answer * weighted_supp_answers,
                                                                                    weighted_supp_queries * current_query,
                                                                                    answer_p_max]),
                                                                      1,
                                                                      activation_fn=tf.nn.sigmoid,
                                                                      weights_initializer=tf.constant_initializer(0.0),
                                                                      biases_initializer=tf.constant_initializer(0.0),
                                                                      scope="answer_weight")

                    new_answer = answer_weight * weighted_supp_answers + current_answer

                    # this condition allows for setting varying number of hops
                    current_answer = tf.cond(tf.greater(self.num_hops, i),
                                             lambda: new_answer,
                                             lambda: current_answer)

                    self.answer_weights.append(answer_weight)

                if i < self._max_hops - 1:
                    with tf.variable_scope("query_update"):
                        # prepare subsequent query
                        aligned_answers_input_with_collab = tf.concat(0, [aligned_answers_input, collab_queries])
                        weighted_answer_words = tf.unsorted_segment_sum(e_scores * aligned_answers_input_with_collab,
                                                                        query_ids, num_queries) / norm

                        c = tf.contrib.layers.fully_connected(tf.concat(1, [current_query, weighted_supp_queries, weighted_answer_words]),
                                                              self._size, activation_fn=tf.tanh, scope="update_candidate",
                                                              weights_initializer=None, biases_initializer=None)

                        gate = tf.contrib.layers.fully_connected(tf.concat(1, [current_query, weighted_supp_queries]),
                                                                 self._size, activation_fn=tf.sigmoid,
                                                                 weights_initializer=None, scope="update_gate",
                                                                 biases_initializer=tf.constant_initializer(1))
                        current_query = gate * current_query + (1-gate) * c

            return current_answer
Example #14
    def __init__(self, size, batch_size, vocab_size, answer_vocab_size, max_length, is_train=True, learning_rate=1e-2,
                 composition="GRU", max_hops=0, devices=None, keep_prob=1.0):
        """
        :param size: size of hidden states
        :param batch_size: initial batch_size (adapts automatically)
        :param vocab_size: size of input vocabulary (vocabulary of contexts)
        :param answer_vocab_size: size of answer (candidates) vocabulary
        :param max_length: maximum length of an individual context
        :param is_train:
        :param learning_rate:
        :param composition: "GRU", "LSTM", "BiGRU" are possible
        :param max_hops: maximum number of hops; can be lowered at run time by assigning a smaller value to the
        variable (self.)num_hops, which is initialized with max_hops
        :param devices: defaults to ["/cpu:0"], but can be a list of up to 3 devices; the model is automatically
        partitioned across the given devices
        :param keep_prob: 1.0 - dropout rate, applied to the input embeddings
        """
        self._vocab_size = vocab_size
        self._max_length = max_length
        self._size = size
        self._batch_size = batch_size
        self._is_train = is_train
        self._composition = composition
        self._max_hops = max_hops
        self._device0 = devices[0] if devices is not None else "/cpu:0"
        self._device1 = devices[1 % len(devices)] if devices is not None else "/cpu:0"
        self._device2 = devices[2 % len(devices)] if devices is not None else "/cpu:0"

        self._init = tf.random_normal_initializer(0.0, 0.1)
        with tf.device(self._device0):
            with tf.variable_scope(self.name(), initializer=tf.contrib.layers.xavier_initializer()):
                self._init_inputs()
                self.keep_prob = tf.get_variable("keep_prob", [], initializer=tf.constant_initializer(keep_prob))
                with tf.device("/cpu:0"):
                    # embeddings
                    self.output_embedding = tf.get_variable("E_candidate", [answer_vocab_size, self._size],
                                                            initializer=self._init)
                    self.input_embedding = tf.get_variable("E_words", [vocab_size, self._size],
                                                           initializer=self._init)
                    answer, _ = tf.dynamic_partition(self._answer_input, self._query_partition, 2)
                    lookup_individual = tf.nn.embedding_lookup(self.output_embedding, answer)
                    cands, _ = tf.dynamic_partition(self._answer_candidates, self._query_partition, 2)
                    self.candidate_lookup = tf.nn.embedding_lookup(self.output_embedding, cands)

                self.num_hops = tf.Variable(self._max_hops, trainable=False, name="num_queries")
                self.query = self._comp_f()
                answer = self._retrieve_answer(self.query)
                self.score = tf_util.batch_dot(lookup_individual, answer)
                self.scores_with_negs = self._score_candidates(answer)

                if is_train:
                    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name="lr")
                    self.global_step = tf.Variable(0, trainable=False, name="step")

                    self.opt = tf.train.AdamOptimizer(self.learning_rate)

                    current_batch_size = tf.gather(tf.shape(self.scores_with_negs), [0])

                    loss = math_ops.reduce_sum(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(self.scores_with_negs,
                                                                       tf.tile(tf.constant([0], tf.int64),
                                                                               current_batch_size)))

                    train_params = tf.trainable_variables()
                    self.training_weight = tf.Variable(1.0, trainable=False, name="training_weight")

                    self._loss = loss / math_ops.cast(current_batch_size, tf.float32)
                    self._grads = tf.gradients(self._loss, train_params, self.training_weight, colocate_gradients_with_ops=True)

                    if len(train_params) > 0:
                        grads, _ = tf.clip_by_global_norm(self._grads, 5.0)
                        self._update = self.opt.apply_gradients(zip(grads, train_params),
                                                                global_step=self.global_step)
                    else:
                        self._update = tf.assign_add(self.global_step, 1)
        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)