Example #1
    def cal_validation_err(self, valid_input):
        """Return prediction MSE on the validation dataset."""
        stage1_weight = self.stage1_weight
        stage2_weight = self.value_func.weight
        se_sum = 0.
        se2_sum = 0.
        weight_sum = 0.
        for sample in valid_input:
            data = sample.data
            current_obs, action, reward = data[:3]
            d_tm1 = self._get_d_tm1(data)
            d_tm1 = tf.expand_dims(d_tm1, axis=1)
            instrumental_feature = self.instrumental_feature(obs=current_obs,
                                                             action=action,
                                                             training=False)
            predicted_feature = linear_reg_pred(instrumental_feature,
                                                stage1_weight)
            current_feature = add_const_col(
                self.value_feature(obs=current_obs,
                                   action=action,
                                   training=True))
            predicted_feature = current_feature - d_tm1 * self.discount * predicted_feature
            predict = linear_reg_pred(predicted_feature, stage2_weight)

            weight = d_tm1 + (1.0 - d_tm1) * tf.convert_to_tensor(
                self.d_tm1_weight, dtype=tf.float32)
            weight = tf.square(weight)
            sq_err = tf.square(tf.expand_dims(reward, -1) - predict)
            se_sum += tf.reduce_sum(weight * sq_err)
            se2_sum += tf.reduce_sum(weight * tf.square(sq_err))
            weight_sum += tf.reduce_sum(weight)
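        # Weighted mean of the squared errors and an estimate of its standard error.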
        mse = se_sum / weight_sum
        mse_err_std = tf.sqrt((se2_sum / weight_sum - mse**2) / weight_sum)
        return mse, mse_err_std
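These snippets rely on a few small helpers (add_const_col, linear_reg_pred, fit_linear, linear_reg_loss) that are defined elsewhere in the library. A minimal sketch of plausible implementations for the first two, assuming add_const_col simply appends a bias column and linear_reg_pred is a plain matrix product; this is an illustration, not the library's verified code:

    import tensorflow as tf

    def add_const_col(feature):
        # Append a column of ones so the linear map can learn an intercept term.
        return tf.concat([feature, tf.ones_like(feature[:, :1])], axis=-1)

    def linear_reg_pred(feature, weight):
        # Linear prediction: one row per sample, one column per target dimension.
        return tf.matmul(feature, weight)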
Example #2
    def update_final_weight(self, stage1_input, stage2_input):
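        """Fit the stage-1 and stage-2 ridge weights in closed form and store them."""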
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)

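        # Stage-1 regression target: phi(s, a) - gamma * discount * phi(s', pi(s')),
        # where phi is the value feature with a constant column appended.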
        target_1st = discount_1st * add_const_col(
            self.value_feature(
                obs=next_obs_1st, action=next_action_1st, training=True))
        target_1st = add_const_col(
            self.value_feature(obs=current_obs_1st,
                               action=action_1st,
                               training=True)) - self.discount * target_1st
        stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                   self.stage1_reg)
        self.stage1_weight.assign(stage1_weight)
        predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                            stage1_weight)
        stage2_weight = fit_linear(tf.expand_dims(reward_2nd, -1),
                                   predicted_feature, self.stage2_reg)
        self.value_func.weight.assign(stage2_weight)

        return stage1_weight, stage2_weight
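fit_linear is the closed-form ridge solve used for both stages. A minimal sketch, assuming it solves (X^T X + reg * n * I) W = X^T Y; the exact scaling of the penalty term is an assumption rather than the library's verified implementation:

    import tensorflow as tf

    def fit_linear(target, feature, reg):
        # Ridge regression in closed form: solve (X^T X + reg * n * I) W = X^T Y.
        n = tf.cast(tf.shape(feature)[0], feature.dtype)
        a = tf.matmul(feature, feature, transpose_a=True)
        a += reg * n * tf.eye(tf.shape(feature)[1], dtype=feature.dtype)
        b = tf.matmul(feature, target, transpose_a=True)
        return tf.linalg.solve(a, b)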
Example #3
    def update_value(self, stage1_input, stage2_input):
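        """Take one gradient step on the value feature network through the stage-2 loss."""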
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)
        l2 = snt.regularizers.L2(self.value_reg)
        with tf.GradientTape() as tape:
            target_1st = discount_1st * add_const_col(
                self.value_feature(
                    obs=next_obs_1st, action=next_action_1st, training=True))
            target_1st = add_const_col(
                self.value_feature(obs=current_obs_1st,
                                   action=action_1st,
                                   training=True)) - self.discount * target_1st
            stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                       self.stage1_reg)
            predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                                stage1_weight)
            loss = linear_reg_loss(tf.expand_dims(reward_2nd, -1),
                                   predicted_feature, self.stage2_reg)
            loss = loss + l2(self.value_feature.trainable_variables)
            loss /= action_2nd.shape[0]

        gradient = tape.gradient(loss, self.value_feature.trainable_variables)
        self._value_func_optimizer.apply(
            gradient, self.value_feature.trainable_variables)
        return loss
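linear_reg_loss is the stage-2 objective that the gradient step above backpropagates through. A minimal sketch, building on the fit_linear and linear_reg_pred sketches above and assuming the loss is the penalised squared error evaluated at the closed-form ridge solution; both points are assumptions about the helper, not its verified definition:

    import tensorflow as tf

    def linear_reg_loss(target, feature, reg):
        # Squared error at the ridge solution plus the matching L2 penalty;
        # gradients flow back into the feature network through feature.
        weight = fit_linear(target, feature, reg)
        pred = linear_reg_pred(feature, weight)
        n = tf.cast(tf.shape(feature)[0], feature.dtype)
        penalty = reg * n * tf.reduce_sum(tf.square(weight))
        return tf.reduce_sum(tf.square(target - pred)) + penalty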
Example #4
    def score_stage2_reg(stage2_reg, predicted_feature_1st,
                         predicted_feature_2nd, reward_1st, reward_2nd):
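        """Fit stage-2 ridge weights on the 2nd split and score them on the 1st split."""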

        weight = fit_linear(reward_2nd, predicted_feature_2nd, stage2_reg)
        pred = linear_reg_pred(predicted_feature_1st, weight)
        loss = tf.reduce_sum((pred - reward_1st)**2).numpy()
        return loss, weight
Example #5
    def score_stage1_reg(stage1_reg, instrumental_feature_1st,
                         instrumental_feature_2nd, target_1st, target_2nd):
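        """Fit stage-1 ridge weights on the 1st split and score them on the 2nd split."""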

        weight = fit_linear(target_1st, instrumental_feature_1st, stage1_reg)
        pred = linear_reg_pred(instrumental_feature_2nd, weight)
        loss = tf.reduce_sum((pred - target_2nd)**2).numpy()
        return loss, weight
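A short sketch of how these two scoring helpers can drive the choice of ridge penalty: sweep a candidate grid and keep the weight with the lowest held-out loss. The candidate values below are placeholders, and the feature/target tensors are assumed to have been built as in Example #9, which performs this same sweep inside update_final_weight:

    import numpy as np

    best_loss, best_weight, best_reg = np.inf, None, None
    for reg in [0.1, 0.01, 0.001, 0.0001]:  # placeholder candidate grid
        loss, weight = score_stage1_reg(reg, instrumental_feature_1st,
                                        instrumental_feature_2nd,
                                        target_1st, target_2nd)
        if loss < best_loss:
            best_loss, best_weight, best_reg = loss, weight, reg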
Example #6
    def cal_validation_err(self, valid_input):
        """Return prediction MSE on the validation dataset."""
        stage1_weight = self.stage1_weight
        stage2_weight = self.value_func.weight
        se_sum = 0.
        se2_sum = 0.
        count = 0.
        for sample in valid_input:
            data = sample.data
            current_obs, action, reward = data[:3]
            instrumental_feature = self.instrumental_feature(obs=current_obs,
                                                             action=action,
                                                             training=False)
            predicted_feature = linear_reg_pred(instrumental_feature,
                                                stage1_weight)
            predict = linear_reg_pred(predicted_feature, stage2_weight)

            sq_err = tf.square(tf.expand_dims(reward, -1) - predict)
            se_sum += tf.reduce_sum(sq_err)
            se2_sum += tf.reduce_sum(tf.square(sq_err))
            count += tf.cast(sq_err.shape[0], tf.float32)
        mse = se_sum / count
        mse_err_std = tf.sqrt((se2_sum / count - mse**2) / count)
        return mse, mse_err_std
Example #7
    def update_value(self, stage1_input, stage2_input):
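        """Take one gradient step on the value features, re-weighting rows via d_tm1."""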
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        d_tm1_1st = self._get_d_tm1(stage1_input)
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        d_tm1_2nd = self._get_d_tm1(stage2_input)
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)
        d_tm1_1st = tf.expand_dims(d_tm1_1st, axis=1)
        d_tm1_2nd = tf.expand_dims(d_tm1_2nd, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_1st = d_tm1_1st * instrumental_feature_1st
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)
        l2 = snt.regularizers.L2(self.value_reg)
        with tf.GradientTape() as tape:
            # target_1st = discount_1st * self.value_feature(obs=next_obs_1st, action=next_action_1st, training=True)
            target_1st = d_tm1_1st * discount_1st * add_const_col(
                self.value_feature(
                    obs=next_obs_1st, action=next_action_1st, training=True))
            stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                       self.stage1_reg)
            predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                                stage1_weight)
            # current_feature = self.value_feature(obs=current_obs_2nd, action=action_2nd, training=True)
            current_feature = add_const_col(
                self.value_feature(obs=current_obs_2nd,
                                   action=action_2nd,
                                   training=True))
            predicted_feature = current_feature - d_tm1_2nd * self.discount * predicted_feature
            # loss = linear_reg_loss(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg)

            weight = d_tm1_2nd + (1.0 - d_tm1_2nd) * tf.convert_to_tensor(
                self.d_tm1_weight, dtype=tf.float32)
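            # Scaling both the target and the features by this weight turns the
            # stage-2 least-squares objective into a weighted fit with per-row
            # weight equal to weight**2.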
            loss = linear_reg_loss(weight * tf.expand_dims(reward_2nd, -1),
                                   weight * predicted_feature, self.stage2_reg)

            loss = loss + l2(self.value_feature.trainable_variables)
            loss /= action_2nd.shape[0]

        gradient = tape.gradient(loss, self.value_feature.trainable_variables)
        self._value_func_optimizer.apply(
            gradient, self.value_feature.trainable_variables)
        return loss
Example #8
    def update_final_weight(self, stage1_input, stage2_input):
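        """Fit both stage weights in closed form, re-weighting rows via d_tm1."""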
        current_obs_1st, action_1st, _, discount_1st, next_obs_1st = stage1_input[:5]
        d_tm1_1st = self._get_d_tm1(stage1_input)
        current_obs_2nd, action_2nd, reward_2nd = stage2_input[:3]
        d_tm1_2nd = self._get_d_tm1(stage2_input)
        next_action_1st = self.policy(next_obs_1st)
        discount_1st = tf.expand_dims(discount_1st, axis=1)
        d_tm1_1st = tf.expand_dims(d_tm1_1st, axis=1)
        d_tm1_2nd = tf.expand_dims(d_tm1_2nd, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st, training=False)
        instrumental_feature_1st = d_tm1_1st * instrumental_feature_1st
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd, training=False)

        # target_1st = discount_1st * self.value_feature(obs=next_obs_1st, action=next_action_1st, training=False)
        target_1st = d_tm1_1st * discount_1st * add_const_col(
            self.value_feature(
                obs=next_obs_1st, action=next_action_1st, training=False))
        stage1_weight = fit_linear(target_1st, instrumental_feature_1st,
                                   self.stage1_reg)
        self.stage1_weight.assign(stage1_weight)
        predicted_feature = linear_reg_pred(instrumental_feature_2nd,
                                            stage1_weight)
        # current_feature = self.value_feature(obs=current_obs_2nd, action=action_2nd, training=False)
        current_feature = add_const_col(
            self.value_feature(obs=current_obs_2nd,
                               action=action_2nd,
                               training=False))
        # predicted_feature = add_const_col(current_feature) - self.discount * add_const_col(predicted_feature)
        predicted_feature = current_feature - d_tm1_2nd * self.discount * predicted_feature
        # self.value_func._weight.assign(
        #     fit_linear(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg))

        weight = d_tm1_2nd + (1.0 - d_tm1_2nd) * tf.convert_to_tensor(
            self.d_tm1_weight, dtype=tf.float32)
        stage2_weight = fit_linear(weight * tf.expand_dims(reward_2nd, -1),
                                   weight * predicted_feature, self.stage2_reg)
        self.value_func.weight.assign(stage2_weight)

        return stage1_weight, stage2_weight
Example #9
    def update_final_weight(self, stage1_input, stage2_input):
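        """Fit both stages in closed form, picking each ridge penalty by cross-split validation."""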
        current_obs_1st, action_1st, reward_1st, discount_1st, next_obs_1st = stage1_input.data[:5]
        current_obs_2nd, action_2nd, reward_2nd, discount_2nd, next_obs_2nd = stage2_input.data[:5]

        next_action_1st = self.policy(next_obs_1st)
        next_action_2nd = self.policy(next_obs_2nd)
        discount_1st = tf.expand_dims(discount_1st, axis=1)
        discount_2nd = tf.expand_dims(discount_2nd, axis=1)
        reward_1st = tf.expand_dims(reward_1st, axis=1)
        reward_2nd = tf.expand_dims(reward_2nd, axis=1)

        instrumental_feature_1st = self.instrumental_feature(
            obs=current_obs_1st, action=action_1st)
        instrumental_feature_2nd = self.instrumental_feature(
            obs=current_obs_2nd, action=action_2nd)

        target_1st = discount_1st * self.value_feature(obs=next_obs_1st,
                                                       action=next_action_1st)
        target_2nd = discount_2nd * self.value_feature(obs=next_obs_2nd,
                                                       action=next_action_2nd)

        # stage1_reg_candidate = [0.1, 0.01, 0.0001, 0.00001]
        stage1_reg_candidate = [
            0.1, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001, 0.00003,
            0.00001
        ]
        stage1_loss = np.inf
        stage1_weight = None
        self.stage1_reg = None
        for stage1_reg in stage1_reg_candidate:
            loss, weight = self.score_stage1_reg(stage1_reg,
                                                 instrumental_feature_1st,
                                                 instrumental_feature_2nd,
                                                 target_1st, target_2nd)
            if stage1_loss > loss:
                stage1_loss = loss
                stage1_weight = weight
                self.stage1_reg = stage1_reg

        # stage1_weight = fit_linear(target_1st, instrumental_feature_1st, self.stage1_reg)
        # stage1_loss = linear_reg_loss(target_1st, instrumental_feature_1st, self.stage1_reg)

        predicted_feature_1st = linear_reg_pred(instrumental_feature_1st,
                                                stage1_weight)
        current_feature_1st = self.value_feature(obs=current_obs_1st,
                                                 action=action_1st)
        predicted_feature_1st = current_feature_1st - self.discount * predicted_feature_1st

        predicted_feature_2nd = linear_reg_pred(instrumental_feature_2nd,
                                                stage1_weight)
        current_feature_2nd = self.value_feature(obs=current_obs_2nd,
                                                 action=action_2nd)
        predicted_feature_2nd = current_feature_2nd - self.discount * predicted_feature_2nd

        # predicted_feature = linear_reg_pred(instrumental_feature_2nd, stage1_weight)
        # current_feature = self.value_feature(obs=current_obs_2nd, action=action_2nd)
        # predicted_feature = current_feature - self.discount * predicted_feature

        stage2_reg_candidate = [
            0.1, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001, 0.00003,
            0.00001
        ]
        stage2_loss = np.inf
        stage2_weight = None
        self.stage2_reg = None
        for stage2_reg in stage2_reg_candidate:
            loss, weight = self.score_stage2_reg(stage2_reg,
                                                 predicted_feature_1st,
                                                 predicted_feature_2nd,
                                                 reward_1st, reward_2nd)
            if stage2_loss > loss:
                stage2_loss = loss
                stage2_weight = weight
                self.stage2_reg = stage2_reg

        # self.value_func._weight.assign(
        #     fit_linear(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg))
        # stage2_loss = linear_reg_loss(tf.expand_dims(reward_2nd, -1), predicted_feature, self.stage2_reg)

        self.value_func._weight.assign(stage2_weight)
        return stage1_loss, stage2_loss, self.stage1_reg, self.stage2_reg