Example #1
    def _build_net(self):

        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        self.cvr = tf.placeholder(tf.float32, [None, ], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.float32, [None, ], name='a')
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s_, variable_scope="actor_target_net")
        self.critic_eval = self._build_q_net(self.s, self.a, variable_scope="eval_q_net")
        self.critic_eval_for_loss = self._build_q_net(self.s, self.a_eval, variable_scope="eval_q_net",
                                                      reuse=True)
        # Target critic bootstraps at the target actor's action, as in standard DDPG.
        self.critic_target = self._build_q_net(self.s_, self.a_target, variable_scope="target_q_net")

        t_gmv_params = scope_vars(absolute_scope_name("target_q_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])
            self.target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])

        with tf.variable_scope('soft_update'):
            self.a_update_target_q = self.__make_update_exp__(ae_params, at_params)
            self.update_target_q = self.__make_update_exp__(e_gmv_params, t_gmv_params)

        with tf.variable_scope('q_target'):
            self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * self.critic_target)

            self.montecarlo_target = self.return_value

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            self._train_ddpg_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=e_gmv_params)
            self._train_ddpg_a_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params)
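
The examples here lean on helpers that are never shown: scope_vars, absolute_scope_name, and __make_update_exp__. Below is a minimal sketch of what they are assumed to do (variable lookup by scope, plus a Polyak soft update); the bodies are plausible reconstructions, not the original implementations:

def absolute_scope_name(relative_scope_name):
    # Prefix with the enclosing variable scope to get an absolute scope name.
    outer = tf.get_variable_scope().name
    return outer + "/" + relative_scope_name if outer else relative_scope_name

def scope_vars(scope_name):
    # All trainable variables created under the given (absolute) scope.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name)

# On the agent class (indented as a method):
    def __make_update_exp__(self, vals, target_vals, polyak=0.999):
        # Soft update: target <- polyak * target + (1 - polyak) * eval.
        updates = [tf.assign(t, polyak * t + (1. - polyak) * e)
                   for e, t in zip(vals, target_vals)]
        return tf.group(*updates)
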
Example #2
    def _build_net(self):

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.current_step_pctrs = tf.placeholder(tf.float32, [None], name='pctr')
        self.probability_of_not_buying = tf.placeholder(tf.float32, [None],
                                                        name='probability_of_not_buying')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.gmv_path_value = tf.placeholder(tf.float32, [None, ], name='gmv_path_value')
        self.restcost_value = tf.placeholder(tf.float32, [None, ], name='restcost_value')
        self.direct_cost_value = tf.placeholder(tf.float32, [None, ], name='direct_cost_value')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
        self.bid_max_ph = tf.placeholder(tf.float32, [None, ], name='bid_max')

        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.gmv_path_net = self._build_q_net(self.s, 2, variable_scope="gmv_net")
        self.cost_path_net = self._build_q_net(self.s, 2, variable_scope="cost_net")

        gmv_params = scope_vars(absolute_scope_name("gmv_net"))
        cost_params = scope_vars(absolute_scope_name("cost_net"))

        with tf.variable_scope('q_eval'):
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.gmv_sa = tf.gather_nd(params=self.gmv_path_net, indices=a_indices)
            self.cost_sa = tf.gather_nd(params=self.cost_path_net, indices=a_indices)
            self.q_sa = self.gmv_sa - self.roi_thr * ((1 - self.done) * self.cost_sa + self.direct_cost_value)

        with tf.variable_scope('loss'):
            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_gmv_op = tf.train.AdamOptimizer(self.lr).minimize(self.gmv_loss, var_list=gmv_params)
            self._train_cost_op = tf.train.AdamOptimizer(self.lr).minimize(self.cost_loss, var_list=cost_params)
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=gmv_params + cost_params)

        with tf.variable_scope('action'):
            profit_a_1 = (self.gmv_path_net[:, 1] - self.roi_thr * self.cost_path_net[:, 1])
            profit_a_0 = (self.gmv_path_net[:, 0] - self.roi_thr * self.cost_path_net[:, 0])
            roi_thr_times_ecpm_diff = profit_a_1 - profit_a_0
            positive_roi_thr_times_ecpm_diff = tf.maximum(roi_thr_times_ecpm_diff, 0, name="positive_diff")
            self.optimal_bid = positive_roi_thr_times_ecpm_diff / (
                    self.roi_thr * self.current_step_pctrs * self.probability_of_not_buying + 1e-10) + 0.01

        with tf.variable_scope('roi'):
            roi_min_action_1 = self.gmv_path_net[:, 1] / (self.cost_path_net[:, 1] + self.bid_max_ph + 1e-10)
            roi_action_0 = self.gmv_path_net[:, 0] / (self.cost_path_net[:, 0] + 1e-10)
            self.max_longterm_roi = tf.maximum(roi_min_action_1, roi_action_0)
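
To trace the arithmetic in the 'action' block with concrete (hypothetical) numbers: suppose for one state the nets predict q_gmv = [2.0, 3.2] and q_cost = [1.0, 1.5], with roi_thr = 2.0, pctr = 0.05 and probability_of_not_buying = 0.9. The same computation in plain numpy:

import numpy as np

# Hypothetical values, only to trace the optimal_bid formula above.
gmv = np.array([[2.0, 3.2]])
cost = np.array([[1.0, 1.5]])
roi_thr, pctr, p_not_buy = 2.0, np.array([0.05]), np.array([0.9])

profit_a_1 = gmv[:, 1] - roi_thr * cost[:, 1]   # 3.2 - 3.0 = 0.2
profit_a_0 = gmv[:, 0] - roi_thr * cost[:, 0]   # 2.0 - 2.0 = 0.0
diff = np.maximum(profit_a_1 - profit_a_0, 0)   # 0.2
bid = diff / (roi_thr * pctr * p_not_buy + 1e-10) + 0.01
print(bid)  # ~2.232: 0.2 / 0.09, plus the 0.01 floor bid
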
Example #3
    def _build_net(self):

        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        self.cvr = tf.placeholder(tf.float32, [None, ], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')
        self.adv = tf.placeholder(tf.float32, [None, ], name='advantage')
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net")
        self.critic = self._build_q_net(self.s, variable_scope="eval_q_net")

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            self._train_ppo_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss, var_list=e_gmv_params)
            self._train_ppo_actor_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params)
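
Example #3 defers to _build_loss / _pick_loss, which are not shown. Below is a minimal sketch of a clipped-surrogate PPO loss consistent with the placeholders above; the softmax-probability reading of a_eval / a_target, the [batch, 1] critic shape, and the clip epsilon of 0.2 are all assumptions:

    def _build_loss(self):
        # Probability of the taken action under the new (eval) and old (target) policies.
        a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
        pi_new = tf.gather_nd(self.a_eval, a_indices)
        pi_old = tf.stop_gradient(tf.gather_nd(self.a_target, a_indices))
        ratio = pi_new / (pi_old + 1e-10)
        # Clipped surrogate objective, maximized, so the loss is its negation.
        surrogate = tf.minimum(ratio * self.adv,
                               tf.clip_by_value(ratio, 1. - 0.2, 1. + 0.2) * self.adv)
        self.actor_loss = -tf.reduce_mean(surrogate)
        # Critic regresses the empirical return (assuming a [batch, 1] critic output).
        self.critic_loss = tf.reduce_mean(
            tf.squared_difference(self.critic[:, 0], self.return_value))
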
Example #4

    def _build_net(self):

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.cvr = tf.placeholder(tf.float32, [None], name='cvr')

        self.cvr_net = self._build_cvr_net(self.s, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]

        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(
                tf.squared_difference(self.predicted_cvr, self.cvr))

        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cvr_loss, var_list=cvr_params)
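
Every CVR example slices column 0 of the output of _build_cvr_net as the predicted conversion rate. A plausible minimal sketch of that builder; the layer sizes and the sigmoid head are assumptions:

    def _build_cvr_net(self, s, variable_scope, reuse=False):
        with tf.variable_scope(variable_scope, reuse=reuse):
            h1 = tf.layers.dense(s, 64, activation=tf.nn.relu, name='h1')
            h2 = tf.layers.dense(h1, 32, activation=tf.nn.relu, name='h2')
            # One sigmoid unit, so cvr_net[:, 0] is a probability in (0, 1).
            return tf.layers.dense(h2, 1, activation=tf.nn.sigmoid, name='cvr_out')
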
Example #5
    def _build_net(self):

        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        self.cvr = tf.placeholder(tf.float32, [None], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.dqn_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.dqn_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None], name='r')
        self.a = tf.placeholder(tf.int32, [None], name='a')
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None], name='done')
        self.return_value = tf.placeholder(tf.float32, [None], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr,
                                           variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.q_eval = self._build_q_net(self.s,
                                        self.n_actions,
                                        variable_scope="eval_q_net")
        self.q_next = self._build_q_net(self.s_,
                                        self.n_actions,
                                        variable_scope="target_q_net")

        t_gmv_params = scope_vars(absolute_scope_name("target_q_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.target_replace_op = tf.group(
                [tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])

        with tf.variable_scope('soft_update'):
            self.update_target_q = self.__make_update_exp__(
                e_gmv_params, t_gmv_params)

        with tf.variable_scope('q_target'):
            # Standard DQN target: max over next-state Q values.
            target_q_sa = tf.reduce_max(self.q_next, axis=-1, name="target_q_sa")
            self.td0_q_target = tf.stop_gradient(
                self.r + self.gamma * (1. - self.done) * target_q_sa)

            # Double DQN target: action chosen by the eval net, value from the target net.
            target_action = tf.argmax(self.q_eval, axis=-1, name="doubledqn_argmax_action",
                                      output_type=tf.int32)
            target_a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), target_action], axis=1)
            ddqn_target_q_sa = tf.gather_nd(params=self.q_next, indices=target_a_indices)
            self.double_dqn_target = tf.stop_gradient(
                self.r + self.gamma * (1. - self.done) * ddqn_target_q_sa)

            # Monte Carlo target: the empirical return.
            self.montecarlo_target = self.return_value

        with tf.variable_scope('q_eval'):
            # Q(s, a) for the actions actually taken in the batch.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(
                tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cvr_loss, var_list=cvr_params)
            self._train_dqn_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss, var_list=e_gmv_params)
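
Examples #2, #5 and #6 call _build_q_net(s, n_actions, variable_scope=...) and index its output as [batch, n_actions]. A minimal sketch consistent with that usage; the hidden sizes are assumptions:

    def _build_q_net(self, s, n_actions, variable_scope, reuse=False):
        with tf.variable_scope(variable_scope, reuse=reuse):
            h1 = tf.layers.dense(s, 64, activation=tf.nn.relu, name='h1')
            h2 = tf.layers.dense(h1, 32, activation=tf.nn.relu, name='h2')
            # Linear head: one Q value per discrete action.
            return tf.layers.dense(h2, n_actions, activation=None, name='q')
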
Example #6
    def _build_net(self):

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
        self.r_gmv = tf.placeholder(tf.float32, [None], name='r_gmv')
        self.r_cost = tf.placeholder(tf.float32, [None], name='r_cost')
        self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
        self.r = tf.placeholder(tf.float32, [None], name='r')
        self.a = tf.placeholder(tf.int32, [None], name='a')
        self.gamma = 1.  # not set in the original snippet; assumed here, as in the other examples
        self.done = tf.placeholder(tf.float32, [None], name='done')
        self.return_gmv_value = tf.placeholder(tf.float32, [None], name='return_gmv')
        self.return_cost_value = tf.placeholder(tf.float32, [None], name='return_cost')
        self.return_value = tf.placeholder(tf.float32, [None], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.q_eval_gmv = self._build_q_net(self.s,
                                            self.n_actions,
                                            variable_scope="eval_gmv_net")
        self.q_next_gmv = self._build_q_net(self.s_,
                                            self.n_actions,
                                            variable_scope="target_gmv_net")
        self.q_eval_cost = self._build_q_net(self.s,
                                             self.n_actions,
                                             variable_scope="eval_cost_net")
        self.q_next_cost = self._build_q_net(self.s_,
                                             self.n_actions,
                                             variable_scope="target_cost_net")
        self.q_eval = self.q_eval_gmv - self.roi_thr * self.q_eval_cost
        self.q_next = self.q_next_gmv - self.roi_thr * self.q_next_cost

        t_gmv_params = scope_vars(absolute_scope_name("target_gmv_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_gmv_net"))
        t_cost_params = scope_vars(absolute_scope_name("target_cost_net"))
        e_cost_params = scope_vars(absolute_scope_name("eval_cost_net"))

        with tf.variable_scope('hard_replacement'):
            self.target_gmv_replace_op = tf.group(
                [tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])
            self.target_cost_replace_op = tf.group([
                tf.assign(t, e) for t, e in zip(t_cost_params, e_cost_params)
            ])

        with tf.variable_scope('soft_update'):
            self.update_gmv_target_q = self.__make_update_exp__(
                e_gmv_params, t_gmv_params)
            self.update_cost_target_q = self.__make_update_exp__(
                e_cost_params, t_cost_params)

        with tf.variable_scope('q_target'):
            # TD(0) targets: greedy action w.r.t. the combined target net,
            # then bootstrap each head (GMV, cost, combined) at that action.
            greedy_action_s_ = tf.argmax(self.q_next, axis=-1, name="td0_argmax_action",
                                         output_type=tf.int32)
            greedy_a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), greedy_action_s_], axis=1)
            target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv, indices=greedy_a_indices)
            target_q_cost_sa = tf.gather_nd(params=self.q_next_cost, indices=greedy_a_indices)
            target_q_sa = tf.gather_nd(params=self.q_next, indices=greedy_a_indices)
            self.td0_q_gmv_target = tf.stop_gradient(
                self.r_gmv + self.gamma * (1. - self.done) * target_q_gmv_sa)
            self.td0_q_cost_target = tf.stop_gradient(
                self.r_cost + self.gamma * (1. - self.done) * target_q_cost_sa)
            self.td0_q_target = tf.stop_gradient(
                self.r + self.gamma * (1. - self.done) * target_q_sa)

            # Double DQN targets: action chosen by the eval net, value from the target net.
            target_action = tf.argmax(self.q_eval, axis=-1, name="doubledqn_argmax_action",
                                      output_type=tf.int32)
            target_a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), target_action], axis=1)
            ddqn_target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv, indices=target_a_indices)
            ddqn_target_q_cost_sa = tf.gather_nd(params=self.q_next_cost, indices=target_a_indices)
            ddqn_target_q_sa = tf.gather_nd(params=self.q_next, indices=target_a_indices)
            self.double_dqn_gmv_target = tf.stop_gradient(
                self.r_gmv + self.gamma * (1. - self.done) * ddqn_target_q_gmv_sa)
            self.double_dqn_cost_target = tf.stop_gradient(
                self.r_cost + self.gamma * (1. - self.done) * ddqn_target_q_cost_sa)
            self.double_dqn_target = tf.stop_gradient(
                self.r + self.gamma * (1. - self.done) * ddqn_target_q_sa)

            # Monte Carlo targets: the empirical returns.
            self.montecarlo_gmv_target = self.return_gmv_value
            self.montecarlo_cost_target = self.return_cost_value
            self.montecarlo_target = self.return_value

        with tf.variable_scope('q_eval'):
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_gmv_wrt_a = tf.gather_nd(params=self.q_eval_gmv, indices=a_indices)
            self.q_eval_cost_wrt_a = tf.gather_nd(params=self.q_eval_cost, indices=a_indices)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)

        with tf.variable_scope('loss'):
            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss, var_list=e_gmv_params + e_cost_params)
            self._train_gmv_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.gmv_loss, var_list=e_gmv_params)
            self._train_cost_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cost_loss, var_list=e_cost_params)

        with tf.variable_scope('roi'):
            # Long-term ROI estimate at the taken actions: predicted GMV / predicted cost.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.plongterm_roi = tf.gather_nd(params=self.q_eval_gmv, indices=a_indices) / (
                tf.gather_nd(params=self.q_eval_cost, indices=a_indices) + 1e-6)
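
A sketch of how one training step for Example #6 might be driven; the method name, the batch keys, and the assumption that _pick_loss has set self.loss are all hypothetical:

    def train_step(self, sess, batch, roi_thr):
        # batch: dict of numpy arrays keyed to mirror the placeholders above.
        feed = {self.s: batch['s'], self.s_: batch['s_'], self.a: batch['a'],
                self.r_gmv: batch['r_gmv'], self.r_cost: batch['r_cost'],
                self.r: batch['r'], self.done: batch['done'],
                self.return_gmv_value: batch['return_gmv'],
                self.return_cost_value: batch['return_cost'],
                self.return_value: batch['return'],
                self.roi_thr: roi_thr,
                self.important_sampling_weight_ph: batch['is_weight']}
        _, loss = sess.run([self._train_op, self.loss], feed_dict=feed)
        return loss

Running self.target_gmv_replace_op and self.target_cost_replace_op (or the soft-update ops) every few steps keeps the target networks in sync with the eval networks.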