def _build_net(self):
    """Build the TF1 graph for the CVR regressor and the DDPG actor/critic.

    Creates the input placeholders, the CVR network, the evaluation and
    target actor/critic networks, hard- and soft-update ops for the target
    networks, the TD(0) and Monte-Carlo targets, the losses and one Adam
    train op per sub-network.

    Reads ``self.cvr_n_features``, ``self.ddpg_n_features`` and ``self.lr``.
    ``self._build_loss()`` / ``self._pick_loss()`` are expected to define
    ``self.loss`` and ``self.actor_loss`` before the train ops are created.
    """
    # Inputs of the CVR regressor: features and regression label.
    self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
    # NOTE(review): this placeholder shares name='r' with ``self.r`` below, so
    # TensorFlow will uniquify one of the two op names. Left unchanged here to
    # keep existing graph names stable -- confirm whether 'cvr' was intended.
    self.cvr = tf.placeholder(tf.float32, [None, ], name='r')

    # Inputs of the actor/critic: state, next state, reward, action, done flag.
    self.s = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s')
    self.s_ = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s_')
    self.r = tf.placeholder(tf.float32, [None, ], name='r')
    self.a = tf.placeholder(tf.float32, [None, ], name='a')
    self.gamma = 1.
    self.done = tf.placeholder(tf.float32, [None, ], name='done')
    self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
    self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

    self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
    self.predicted_cvr = self.cvr_net[:, 0]

    self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
    self.a_target = self._build_action_net(self.s_, variable_scope="actor_target_net")

    # Critic on the fed action (for the critic loss) and, sharing the same
    # variables, on the actor's own action (for the actor loss).
    self.critic_eval = self._build_q_net(self.s, self.a, variable_scope="eval_q_net")
    self.critic_eval_for_loss = self._build_q_net(self.s, self.a_eval, variable_scope="eval_q_net", reuse=True)

    # The bootstrap value has to be Q'(s', mu'(s')): evaluate the target critic
    # on the target actor's action for ``s_``. The previous code passed the
    # fed action ``self.a`` (the action paired with ``s`` in ``critic_eval``),
    # which left ``self.a_target`` unused and bootstrapped from Q'(s', a).
    self.critic_target = self._build_q_net(self.s_, self.a_target, variable_scope="target_q_net")

    t_gmv_params = scope_vars(absolute_scope_name("target_q_net"))
    e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
    ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
    at_params = scope_vars(absolute_scope_name("actor_target_net"))
    cvr_params = scope_vars(absolute_scope_name("cvr_net"))

    # Copy evaluation-network variables into the target networks.
    with tf.variable_scope('hard_replacement'):
        self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])
        self.target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])

    with tf.variable_scope('soft_update'):
        self.a_update_target_q = self.__make_update_exp__(ae_params, at_params)
        self.update_target_q = self.__make_update_exp__(e_gmv_params, t_gmv_params)

    with tf.variable_scope('q_target'):
        # stop_gradient keeps the target out of back-propagation.
        self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * self.critic_target)
        self.montecarlo_target = self.return_value

    with tf.variable_scope('loss'):
        self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))
        self._build_loss()
        self._pick_loss()

    # Each optimizer only updates the variables of its own sub-network.
    with tf.variable_scope('train'):
        self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
        self._train_ddpg_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=e_gmv_params)
        self._train_ddpg_a_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params)
def _build_net(self):
    """Build the TF1 graph with separate GMV and cost networks.

    Creates the input placeholders, two 2-output networks over the state
    (``gmv_net`` and ``cost_net``), the values of the fed action, the
    losses (via ``self._build_loss`` / ``self._pick_loss``), three Adam
    train ops, and two derived outputs: ``self.optimal_bid`` and
    ``self.max_longterm_roi``.

    Reads ``self.n_features`` and ``self.lr``. ``self._build_loss()`` /
    ``self._pick_loss()`` are expected to define ``self.gmv_loss``,
    ``self.cost_loss`` and ``self.loss`` before the train ops are created.
    """
    self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
    self.current_step_pctrs = tf.placeholder(tf.float32, [None], name='pctr')
    self.probability_of_not_buying = tf.placeholder(tf.float32, [None], name='probability_of_not_buying')
    self.a = tf.placeholder(tf.int32, [None, ], name='a')
    self.done = tf.placeholder(tf.float32, [None, ], name='done')
    self.gmv_path_value = tf.placeholder(tf.float32, [None, ], name='gmv_path_value')
    self.restcost_value = tf.placeholder(tf.float32, [None, ], name='restcost_value')
    # Previously created with name='restcost_value' (copy-paste of the line
    # above), which made TensorFlow silently rename this op with a numeric
    # suffix. Give it its own name so the graph matches the attribute.
    self.direct_cost_value = tf.placeholder(tf.float32, [None, ], name='direct_cost_value')
    self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
    # Scalar placeholder (shape []).
    self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
    self.bid_max_ph = tf.placeholder(tf.float32, [None, ], name='bid_max')
    self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

    # Both networks have two outputs, indexed below as column 0 and column 1.
    self.gmv_path_net = self._build_q_net(self.s, 2, variable_scope="gmv_net")
    self.cost_path_net = self._build_q_net(self.s, 2, variable_scope="cost_net")

    gmv_params = scope_vars(absolute_scope_name("gmv_net"))
    cost_params = scope_vars(absolute_scope_name("cost_net"))

    with tf.variable_scope('q_eval'):
        # (row, action) pairs used to pick the output of the fed action per sample.
        a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
        self.gmv_sa = tf.gather_nd(params=self.gmv_path_net, indices=a_indices)
        self.cost_sa = tf.gather_nd(params=self.cost_path_net, indices=a_indices)
        # Combined value: GMV minus roi_thr times (masked cost estimate + direct cost).
        self.q_sa = self.gmv_sa - self.roi_thr * ((1 - self.done) * self.cost_sa + self.direct_cost_value)

    with tf.variable_scope('loss'):
        self._build_loss()
        self._pick_loss()

    with tf.variable_scope('train'):
        self._train_gmv_op = tf.train.AdamOptimizer(self.lr).minimize(self.gmv_loss, var_list=gmv_params)
        self._train_cost_op = tf.train.AdamOptimizer(self.lr).minimize(self.cost_loss, var_list=cost_params)
        self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=gmv_params + cost_params)

    with tf.variable_scope('action'):
        profit_a_1 = (self.gmv_path_net[:, 1] - self.roi_thr * self.cost_path_net[:, 1])
        profit_a_0 = (self.gmv_path_net[:, 0] - self.roi_thr * self.cost_path_net[:, 0])
        roi_thr_times_ecpm_diff = profit_a_1 - profit_a_0
        # Clip negative differences to zero before converting them into a bid.
        positive_roi_thr_times_ecpm_diff = tf.maximum(roi_thr_times_ecpm_diff, 0, name="positive_diff")
        # 1e-10 guards against division by zero; 0.01 is added to every bid.
        self.optimal_bid = positive_roi_thr_times_ecpm_diff / (
            self.roi_thr * self.current_step_pctrs * self.probability_of_not_buying + 1e-10) + 0.01

    with tf.variable_scope('roi'):
        roi_min_action_1 = self.gmv_path_net[:, 1] / (self.cost_path_net[:, 1] + self.bid_max_ph + 1e-10)
        roi_action_0 = self.gmv_path_net[:, 0] / (self.cost_path_net[:, 0] + 1e-10)
        self.max_longterm_roi = tf.maximum(roi_min_action_1, roi_action_0)
def _build_net(self):
    """Build the TF1 graph for the CVR regressor and the PPO actor/critic.

    Creates the input placeholders, the CVR network, two actor networks
    over the same state (``actor_eval_net`` and ``actor_target_net``), a
    critic, the hard-replacement op that copies the evaluation actor into
    the target actor, the losses and one Adam train op per sub-network.

    Reads ``self.cvr_n_features``, ``self.ppo_n_features`` and ``self.lr``.
    ``self._build_loss()`` / ``self._pick_loss()`` are expected to define
    ``self.critic_loss`` and ``self.actor_loss`` before the train ops are
    created.
    """
    # Inputs of the CVR regressor: features and regression label.
    self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
    # NOTE(review): shares name='r' with ``self.r`` below; TensorFlow will
    # uniquify one of the two op names. Left unchanged to keep graph names stable.
    self.cvr = tf.placeholder(tf.float32, [None, ], name='r')

    self.s = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s')
    self.s_ = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s_')
    self.r = tf.placeholder(tf.float32, [None, ], name='r')
    self.a = tf.placeholder(tf.int32, [None, ], name='a')
    self.adv = tf.placeholder(tf.float32, [None, ], name='advantage')
    self.gamma = 1.
    self.done = tf.placeholder(tf.float32, [None, ], name='done')
    self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
    self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

    self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
    self.predicted_cvr = self.cvr_net[:, 0]

    # Both actors are evaluated on the same state ``self.s``.
    self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
    self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net")
    self.critic = self._build_q_net(self.s, variable_scope="eval_q_net")

    ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
    at_params = scope_vars(absolute_scope_name("actor_target_net"))
    e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
    cvr_params = scope_vars(absolute_scope_name("cvr_net"))

    with tf.variable_scope('hard_replacement'):
        self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])

    with tf.variable_scope('loss'):
        self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))
        self._build_loss()
        self._pick_loss()

    with tf.variable_scope('train'):
        self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
        # Restrict each optimizer to its own sub-network. Without ``var_list``
        # ``minimize`` updates every trainable variable the loss depends on,
        # which could include ``actor_target_net`` (only meant to change via
        # ``a_target_replace_op``) if the actor loss reads ``self.a_target``
        # without a stop_gradient -- the loss definition is not visible here.
        self._train_ppo_critic_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.critic_loss, var_list=e_gmv_params)
        self._train_ppo_actor_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.actor_loss, var_list=ae_params)
def _build_net(self):
    """Build the TF1 graph of the stand-alone CVR regressor.

    Defines the feature/label placeholders, the ``cvr_net`` network, a
    mean-squared-error loss between its first output column and the label,
    and an Adam train op restricted to the ``cvr_net`` variables.

    Reads ``self.n_features`` and ``self.lr``.
    """
    # Inputs: feature matrix and per-sample regression label.
    feature_shape = [None, self.n_features]
    self.s = tf.placeholder(tf.float32, feature_shape, name='s')
    self.cvr = tf.placeholder(tf.float32, [None], name='r')

    # Network output; column 0 is used as the prediction.
    self.cvr_net = self._build_cvr_net(self.s, variable_scope="cvr_net")
    self.predicted_cvr = self.cvr_net[:, 0]

    trainable_vars = scope_vars(absolute_scope_name("cvr_net"))

    with tf.variable_scope('loss'):
        squared_error = tf.squared_difference(self.predicted_cvr, self.cvr)
        self.cvr_loss = tf.reduce_mean(squared_error)

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_op = optimizer.minimize(self.cvr_loss, var_list=trainable_vars)
def _build_net(self):
    """Build the TF1 graph for the CVR regressor and the DQN agent.

    Creates the input placeholders, the CVR network, evaluation and target
    Q-networks, hard/soft target-update ops, three alternative regression
    targets (TD(0), double-DQN style, Monte-Carlo), the Q-value of the fed
    action, the losses and two Adam train ops.

    Reads ``self.cvr_n_features``, ``self.dqn_n_features``,
    ``self.n_actions`` and ``self.lr``. ``self._build_loss()`` /
    ``self._pick_loss()`` are expected to define ``self.loss``.
    """

    def float_vector(name):
        # Per-sample float32 placeholder of shape [None].
        return tf.placeholder(tf.float32, [None], name=name)

    def feature_matrix(width, name):
        # float32 placeholder of shape [None, width].
        return tf.placeholder(tf.float32, [None, width], name=name)

    def row_action_pairs(actions):
        # Pair every batch row index with an action so gather_nd can pick
        # one Q-value per sample; the batch size is taken from ``self.a``.
        batch_size = tf.cast(tf.shape(self.a)[0], dtype=tf.int32)
        rows = tf.range(batch_size, dtype=tf.int32)
        return tf.stack([rows, actions], axis=1)

    def one_step_target(next_value):
        # r + gamma * (1 - done) * next_value, excluded from back-propagation.
        return tf.stop_gradient(self.r + self.gamma * (1. - self.done) * next_value)

    # ---- placeholders -------------------------------------------------
    self.s_cvr = feature_matrix(self.cvr_n_features, 's_cvr')
    self.cvr = float_vector('r')
    self.s = feature_matrix(self.dqn_n_features, 's')
    self.s_ = feature_matrix(self.dqn_n_features, 's_')
    self.r = float_vector('r')
    self.a = tf.placeholder(tf.int32, [None], name='a')
    self.gamma = 1.
    self.done = float_vector('done')
    self.return_value = float_vector('return')
    self.important_sampling_weight_ph = float_vector("important_sampling_weight")

    # ---- networks -----------------------------------------------------
    self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
    self.predicted_cvr = self.cvr_net[:, 0]
    self.q_eval = self._build_q_net(self.s, self.n_actions, variable_scope="eval_q_net")
    self.q_next = self._build_q_net(self.s_, self.n_actions, variable_scope="target_q_net")

    target_vars = scope_vars(absolute_scope_name("target_q_net"))
    online_vars = scope_vars(absolute_scope_name("eval_q_net"))
    cvr_vars = scope_vars(absolute_scope_name("cvr_net"))

    # ---- target-network synchronisation -------------------------------
    with tf.variable_scope('hard_replacement'):
        copy_ops = [
            tf.assign(target_var, online_var)
            for target_var, online_var in zip(target_vars, online_vars)
        ]
        self.target_replace_op = tf.group(copy_ops)

    with tf.variable_scope('soft_update'):
        self.update_target_q = self.__make_update_exp__(online_vars, target_vars)

    # ---- regression targets -------------------------------------------
    with tf.variable_scope('q_target'):
        max_next_q = tf.reduce_max(self.q_next, axis=-1, name="target_q_sa")
        self.td0_q_target = one_step_target(max_next_q)

        # NOTE(review): the action is the argmax of ``self.q_eval``, which is
        # built from ``self.s`` (not ``self.s_``), yet it indexes
        # ``self.q_next`` built from ``self.s_``. Standard Double DQN selects
        # the action with the online network on the next state -- confirm
        # whether this is intended.
        selected_action = tf.argmax(self.q_eval,
                                    axis=-1,
                                    name="doubeldqn_argmax_action",
                                    output_type=tf.int32)
        selected_pairs = row_action_pairs(selected_action)
        selected_next_q = tf.gather_nd(params=self.q_next, indices=selected_pairs)
        self.double_dqn_target = one_step_target(selected_next_q)

        self.montecarlo_target = self.return_value

    # ---- Q-value of the fed action ------------------------------------
    with tf.variable_scope('q_eval'):
        taken_pairs = row_action_pairs(self.a)
        self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=taken_pairs)

    # ---- losses -------------------------------------------------------
    with tf.variable_scope('loss'):
        cvr_squared_error = tf.squared_difference(self.predicted_cvr, self.cvr)
        self.cvr_loss = tf.reduce_mean(cvr_squared_error)
        self._build_loss()
        self._pick_loss()

    # ---- optimisers (one per sub-network) ------------------------------
    with tf.variable_scope('train'):
        cvr_optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_cvr_op = cvr_optimizer.minimize(self.cvr_loss, var_list=cvr_vars)
        dqn_optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_dqn_op = dqn_optimizer.minimize(self.loss, var_list=online_vars)
def _build_net(self):
    """Build the TF1 graph for a DQN with separate GMV and cost Q-networks.

    Creates the input placeholders, evaluation/target Q-networks for GMV
    and for cost, the combined Q-values ``gmv - roi_thr * cost``, hard/soft
    target-update ops, TD(0) / double-DQN style / Monte-Carlo targets for
    each of (gmv, cost, combined), the Q-values of the fed action, the
    losses, three Adam train ops and ``self.plongterm_roi``.

    Reads ``self.n_features``, ``self.n_actions``, ``self.lr`` and
    ``self.gamma`` (``self.gamma`` is not assigned in this method, so it
    must already be set on the instance). ``self._build_loss()`` /
    ``self._pick_loss()`` are expected to define ``self.loss``,
    ``self.gmv_loss`` and ``self.cost_loss``.
    """

    def float_vector(name):
        # Per-sample float32 placeholder of shape [None].
        return tf.placeholder(tf.float32, [None], name=name)

    def row_action_pairs(actions):
        # Pair every batch row index with an action so gather_nd can pick
        # one value per sample; the batch size is taken from ``self.a``.
        batch_size = tf.cast(tf.shape(self.a)[0], dtype=tf.int32)
        rows = tf.range(batch_size, dtype=tf.int32)
        return tf.stack([rows, actions], axis=1)

    def one_step_target(reward, next_value):
        # reward + gamma * (1 - done) * next_value, excluded from back-propagation.
        return tf.stop_gradient(reward + self.gamma * (1. - self.done) * next_value)

    def copy_into(target_vars, source_vars):
        # Grouped assignment of every source variable to its target variable.
        return tf.group([
            tf.assign(target_var, source_var)
            for target_var, source_var in zip(target_vars, source_vars)
        ])

    # ---- placeholders -------------------------------------------------
    state_shape = [None, self.n_features]
    self.s = tf.placeholder(tf.float32, state_shape, name='s')
    self.s_ = tf.placeholder(tf.float32, state_shape, name='s_')
    self.r_gmv = float_vector('r_gmv')
    self.r_cost = float_vector('r_cost')
    self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")  # scalar
    self.r = float_vector('r')
    self.a = tf.placeholder(tf.int32, [None], name='a')
    self.done = float_vector('done')
    self.return_gmv_value = float_vector('return_gmv')
    self.return_cost_value = float_vector('return_cost')
    self.return_value = float_vector('return')
    self.important_sampling_weight_ph = float_vector("important_sampling_weight")

    # ---- networks -----------------------------------------------------
    self.q_eval_gmv = self._build_q_net(self.s, self.n_actions, variable_scope="eval_gmv_net")
    self.q_next_gmv = self._build_q_net(self.s_, self.n_actions, variable_scope="target_gmv_net")
    self.q_eval_cost = self._build_q_net(self.s, self.n_actions, variable_scope="eval_cost_net")
    self.q_next_cost = self._build_q_net(self.s_, self.n_actions, variable_scope="target_cost_net")

    # Combined Q-value: GMV minus roi_thr times cost.
    self.q_eval = self.q_eval_gmv - self.roi_thr * self.q_eval_cost
    self.q_next = self.q_next_gmv - self.roi_thr * self.q_next_cost

    gmv_target_vars = scope_vars(absolute_scope_name("target_gmv_net"))
    gmv_online_vars = scope_vars(absolute_scope_name("eval_gmv_net"))
    cost_target_vars = scope_vars(absolute_scope_name("target_cost_net"))
    cost_online_vars = scope_vars(absolute_scope_name("eval_cost_net"))

    # ---- target-network synchronisation -------------------------------
    with tf.variable_scope('hard_replacement'):
        self.target_gmv_replace_op = copy_into(gmv_target_vars, gmv_online_vars)
        self.target_cost_replace_op = copy_into(cost_target_vars, cost_online_vars)

    with tf.variable_scope('soft_update'):
        self.update_gmv_target_q = self.__make_update_exp__(gmv_online_vars, gmv_target_vars)
        self.update_cost_target_q = self.__make_update_exp__(cost_online_vars, cost_target_vars)

    # ---- regression targets -------------------------------------------
    with tf.variable_scope('q_target'):
        # TD(0): action chosen by the combined target-network value.
        greedy_next_action = tf.argmax(self.q_next,
                                       axis=-1,
                                       name="td0_argmax_action",
                                       output_type=tf.int32)
        greedy_pairs = row_action_pairs(greedy_next_action)
        next_gmv = tf.gather_nd(params=self.q_next_gmv, indices=greedy_pairs)
        next_cost = tf.gather_nd(params=self.q_next_cost, indices=greedy_pairs)
        next_total = tf.gather_nd(params=self.q_next, indices=greedy_pairs)
        self.td0_q_gmv_target = one_step_target(self.r_gmv, next_gmv)
        self.td0_q_cost_target = one_step_target(self.r_cost, next_cost)
        self.td0_q_target = one_step_target(self.r, next_total)

        # NOTE(review): the action is the argmax of ``self.q_eval``, which is
        # built from ``self.s`` (not ``self.s_``), yet it indexes the
        # target-network outputs built from ``self.s_``. Standard Double DQN
        # selects the action with the online network on the next state --
        # confirm whether this is intended.
        selected_action = tf.argmax(self.q_eval,
                                    axis=-1,
                                    name="doubeldqn_argmax_action",
                                    output_type=tf.int32)
        selected_pairs = row_action_pairs(selected_action)
        selected_gmv = tf.gather_nd(params=self.q_next_gmv, indices=selected_pairs)
        selected_cost = tf.gather_nd(params=self.q_next_cost, indices=selected_pairs)
        selected_total = tf.gather_nd(params=self.q_next, indices=selected_pairs)
        self.double_dqn_gmv_target = one_step_target(self.r_gmv, selected_gmv)
        self.double_dqn_cost_target = one_step_target(self.r_cost, selected_cost)
        self.double_dqn_target = one_step_target(self.r, selected_total)

        # Monte-Carlo targets are the fed returns themselves.
        self.montecarlo_gmv_target = self.return_gmv_value
        self.montecarlo_cost_target = self.return_cost_value
        self.montecarlo_target = self.return_value

    # ---- Q-values of the fed action -----------------------------------
    with tf.variable_scope('q_eval'):
        taken_pairs = row_action_pairs(self.a)
        self.q_eval_gmv_wrt_a = tf.gather_nd(params=self.q_eval_gmv, indices=taken_pairs)
        self.q_eval_cost_wrt_a = tf.gather_nd(params=self.q_eval_cost, indices=taken_pairs)
        self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=taken_pairs)

    # ---- losses -------------------------------------------------------
    with tf.variable_scope('loss'):
        self._build_loss()
        self._pick_loss()

    # ---- optimisers (creation order kept: joint, gmv, cost) ------------
    with tf.variable_scope('train'):
        joint_optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_op = joint_optimizer.minimize(
            self.loss, var_list=gmv_online_vars + cost_online_vars)
        gmv_optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_gmv_op = gmv_optimizer.minimize(self.gmv_loss, var_list=gmv_online_vars)
        cost_optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_cost_op = cost_optimizer.minimize(self.cost_loss, var_list=cost_online_vars)

    # ---- GMV / cost ratio for the fed action --------------------------
    with tf.variable_scope('roi'):
        roi_pairs = row_action_pairs(self.a)
        gmv_of_action = tf.gather_nd(params=self.q_eval_gmv, indices=roi_pairs)
        cost_of_action = tf.gather_nd(params=self.q_eval_cost, indices=roi_pairs)
        # 1e-6 guards against division by zero.
        self.plongterm_roi = gmv_of_action / (cost_of_action + 1e-6)