    def build_model_training_(self):
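        """Build the encoder training op: minimize self.loss_t with respect to the variables in the encoder namespace."""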

        encoder_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.ENCODER_NAMESPACE)
        encoder_optimizer = agent_utils.get_optimizer(self.optimizer, self.learning_rate)

        self.encoder_train_step = encoder_optimizer.minimize(
            self.loss_t, global_step=self.global_step, var_list=encoder_variables
        )

    def build_prior_training_(self):
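        """Build the prior training op: minimize self.prior_loss_t with respect to the mixture means, log-variances and t variable."""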

        self.mixture_variables = [self.mixtures_mu_v, self.mixtures_logvar_v, self.t_v]
        mixture_optimizer = agent_utils.get_optimizer(self.optimizer, self.model_learning_rate)

        self.prior_train_step = mixture_optimizer.minimize(
            self.prior_loss_t, global_step=self.global_step, var_list=self.mixture_variables
        )

    def learn_model(self, state_probs, next_state_probs, actions, rewards, dones, rlr=0.01, tlr=0.5, num_steps=500,
                    ad_hoc_loss=False, batch_size=1000, opt=constants.OPT_ADAM):
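        """
        Fit a tabular reward matrix and transition model to soft state assignments.

        :param state_probs:         soft assignments of each sample to the abstract states.
        :param next_state_probs:    soft assignments of each next state.
        :param actions:             actions taken.
        :param rewards:             rewards received.
        :param dones:               episode termination flags.
        :param rlr:                 learning rate for the reward matrix.
        :param tlr:                 learning rate for the transition matrix.
        :param num_steps:           number of gradient steps.
        :param ad_hoc_loss:         use squared-error losses instead of the expected losses.
        :param batch_size:          mini-batch size.
        :param opt:                 optimizer to use.
        :return:                    array of per-step reward and transition losses, shape (num_steps, 2).
        """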

        dataset = ArrayDataset({
            constants.STATE_PROBABILITIES: state_probs, constants.NEXT_STATE_PROBABILITIES: next_state_probs,
            constants.ACTIONS: actions, constants.REWARDS: rewards, constants.DONES: dones
        })

        with tf.variable_scope("learn_model"):

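            # reward matrix: R[a, s] is the predicted reward for taking action a in abstract state s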
            R = tf.get_variable(
                "reward_matrix", shape=(len(self.actions), len(self.states)), dtype=tf.float32,
                initializer=tf.random_uniform_initializer(minval=0, maxval=1, dtype=tf.float32)
            )

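            # transition logits: T[a, s, s'] parameterizes P(s' | s, a) via a softmax over the last axis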
            T = tf.get_variable(
                "transition_matrix", shape=(len(self.actions), len(self.states), len(self.states)), dtype=tf.float32,
                initializer=tf.random_uniform_initializer(minval=-1, maxval=1, dtype=tf.float32)
            )
            T_softmax = tf.nn.softmax(T, axis=2)
            T_logsoftmax = tf.nn.log_softmax(T, axis=2)

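            # placeholders for a mini-batch of transitions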
            state_probs_pl = tf.placeholder(dtype=tf.float32, name="state_probs_pl")
            next_state_probs_pl = tf.placeholder(dtype=tf.float32, name="next_state_probs_pl")
            actions_pl = tf.placeholder(dtype=tf.int32, name="actions_pl")
            rewards_pl = tf.placeholder(dtype=tf.float32, name="rewards_pl")
            dones_pl = tf.placeholder(dtype=tf.bool, name="dones_pl")

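            # select the rows of R and T that correspond to the actions taken in the batch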
            R_gather = tf.gather(R, actions_pl)
            T_softmax_gather = tf.gather(T_softmax, actions_pl)
            T_logsoftmax_gather = tf.gather(T_logsoftmax, actions_pl)

            if ad_hoc_loss:
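                # squared-error losses: match the observed reward to the expected reward under the current
                # state distribution, and the observed next-state distribution to the predicted one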
                reward_loss = (1 / 2) * tf.reduce_mean(
                    tf.square(rewards_pl - tf.reduce_sum(R_gather * state_probs_pl, axis=1)), axis=0
                )

                transition_loss = (1 / 2) * tf.reduce_mean(tf.reduce_sum(
                    tf.square(
                        next_state_probs_pl - tf.matmul(tf.transpose(T_softmax_gather, perm=[0, 2, 1]),
                                                       state_probs_pl[:, :, tf.newaxis])[:, :, 0]
                    ), axis=1
                ), axis=0)
            else:
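                # expected losses: squared reward error weighted by the state probabilities, and the negative
                # expected log-likelihood of the next state, masked out for terminal transitions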
                reward_loss = (1 / 2) * tf.reduce_mean(
                    tf.reduce_sum(tf.square(rewards_pl[:, tf.newaxis] - R_gather) * state_probs_pl, axis=1), axis=0
                )

                transition_loss = - tf.reduce_mean(tf.reduce_sum(
                    state_probs_pl[:, :, tf.newaxis] * next_state_probs_pl[:, tf.newaxis, :] * T_logsoftmax_gather,
                    axis=[1, 2]
                ) * (1 - tf.cast(dones_pl, tf.float32)), axis=0)

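            # separate optimizers so the reward and transition matrices can use different learning rates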
            R_step = utils.get_optimizer(opt, rlr).minimize(reward_loss)
            T_step = utils.get_optimizer(opt, tlr).minimize(transition_loss)

            losses = []
            epoch_size = max(len(state_probs) // batch_size, 1)

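            # train in a temporary session; only the variables created under the "learn_model" scope are initialized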
            with tf.Session() as sess:

                sess.run(tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="learn_model"))
                )

                for i in range(num_steps):

                    epoch_step = i % epoch_size

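                    # reshuffle the dataset at the start of each epoch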
                    if epoch_step == 0:
                        dataset.shuffle()

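                    # slice indices for the current mini-batch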
                    b = np.index_exp[epoch_step * batch_size: (epoch_step + 1) * batch_size]

                    _, _, tmp_reward_loss, tmp_transition_loss = sess.run(
                        [R_step, T_step, reward_loss, transition_loss], feed_dict={
                            state_probs_pl: dataset[constants.STATE_PROBABILITIES][b],
                            next_state_probs_pl: dataset[constants.NEXT_STATE_PROBABILITIES][b],
                            actions_pl: dataset[constants.ACTIONS][b],
                            rewards_pl: dataset[constants.REWARDS][b],
                            dones_pl: dataset[constants.DONES][b]
                        }
                    )

                    losses.append([tmp_reward_loss, tmp_transition_loss])

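                # store the learned model: rewards indexed by (state, action), transitions by (state, next state, action)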
                self.rewards = np.transpose(sess.run(R), axes=[1, 0])
                self.transitions = np.transpose(sess.run(T_softmax), axes=[1, 2, 0])

            losses = np.stack(losses, axis=0)

        return losses