def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
             train_step, begin_copy, copy_step, epsilon_delta, epsilon_start,
             epsilon_end, load_model, path_to_load, path_to_save,
             episode_steps, episode_to_save, max_buffer_len):
    # Epsilon
    self.epsilon_delta = epsilon_delta
    self.epsilon_end = epsilon_end
    self.epsilon_start = epsilon_start
    self.epsilon = epsilon_start
    # Main Params
    self.minibatch = minibatch
    self.action_number = action_number
    self.gamma = gamma
    # Episode Params
    self.begin_train = begin_train
    self.begin_copy = begin_copy
    self.copy_step = copy_step
    self.train_step = train_step
    self.episodes = episodes
    self.episode_steps = episode_steps
    self.episode_to_save = episode_to_save
    # I/O params
    self.path_to_load = path_to_load
    self.path_to_save = path_to_save
    self.load_model = load_model
    # Model Fields
    self.action = None
    self.state = None
    self.replay_buffer = ReplayBuffer(max_buffer_len)
    # Model
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # self.device = torch.device('cpu')
    self.model = BoxModel((150, 100, 1), action_number).to(self.device)
    if self.load_model:
        self.model.load_state_dict(torch.load(self.path_to_load))
    # Rewards
    self.rewards_white, self.rewards_black, self.rewards = [], [], []
def __init__(
        self,
        n_actions=11,
        n_features=29,
        use_prioritized_experience_replay=True,
        max_trajectory_length=20,
):
    self.n_actions = n_actions
    self.n_features = n_features
    self.gamma = 1.
    self.lr = 0.001

    self.epsilon = 0.5
    self.epsilon_min = 0
    self.epsilon_dec = 0.1
    self.epsilon_dec_iter = 1000

    self.replace_target_iter = 100
    self.soft_update_iter = 1
    self.softupdate = False

    self.scope_name = "DQN-model"
    self.epoch = 0

    self.buffer_size = 5000 * max_trajectory_length
    self.batch_size = 512
    self.alpha = 0.6
    self.beta = 0.4
    self.use_prioritized_experience_replay = use_prioritized_experience_replay
    if self.use_prioritized_experience_replay:
        self.prioritized_replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=self.alpha, max_priority=20.)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

    self.margin_constant = 2

    with tf.variable_scope(self.scope_name):
        self._build_net()

    self.build_model_saver(self.scope_name)
class ConstrainedPPO(CMDPAgent): def init_parameters(self, sess): if self.has_target_net: super(CMDPAgent, self).init_parameters(sess) sess.run(self.a_target_replace_op) def __init__(self, user_num, n_actions, cvr_n_features, ppo_n_features, init_roi, budget, use_budget_control, use_prioritized_experience_replay, max_trajectory_length, update_times_per_train=1, use_predict_cvr=False): self.user_num = user_num self.use_budget_control = use_budget_control self.update_times_per_train = update_times_per_train self.n_actions = n_actions self.action_dim = 1 self.cvr_n_features = cvr_n_features self.ppo_n_features = ppo_n_features self.lr = 0.001 self.use_predict_cvr = use_predict_cvr self.user_based_adjust_times = 40 self.epsilon = 0.4 self.epsilon_min = 0.05 self.epsilon_dec = 0.1 self.epsilon_dec_iter = 5000 // self.user_based_adjust_times self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times self.epsilon_clip = 0.2 self.lam = 0.5 self.update_step = 1 self.kl_target = 0.01 self.gamma = 1. self.method = 'clip' self.policy_logvar = 1e-7 self.replace_target_iter = 1 self.soft_update_iter = 1 self.softupdate = False self.scope_name = "CPPO-model" self.epoch = 0 self.cvr_buffer_size = 1000 * max_trajectory_length self.cvr_batch_size = 512 self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False) self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay self.ppo_buffer_size = 1000 * max_trajectory_length self.ppo_batch_size = 250 if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ppo_buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.ppo_buffer_size, save_return=True) with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_cvr_net(self, state, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1', kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense(fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) cvr_out = tf.sigmoid(tf.layers.dense(fc3, units=1, name='cvr', kernel_initializer=initializers.xavier_initializer())) return cvr_out def _build_action_net(self, state, variable_scope): with tf.variable_scope(variable_scope): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1', 
kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense(fc2, units=n_features // 4, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) a_prob = tf.layers.dense(fc3, self.n_actions, tf.nn.softmax, kernel_initializer=initializers.xavier_initializer()) return a_prob def _build_q_net(self, state, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1', kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense(fc2, units=n_features // 4, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) v = tf.layers.dense(fc3, 1, kernel_initializer=initializers.xavier_initializer()) return v[:, 0] def __make_update_exp__(self, vals, target_vals): polyak = 1.0 - 1e-2 expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) expression = tf.group(*expression) return expression def _build_net(self): self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr') self.cvr = tf.placeholder(tf.float32, [None, ], name='r') self.s = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s') self.s_ = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s_') self.r = tf.placeholder(tf.float32, [None, ], name='r') self.a = tf.placeholder(tf.int32, [None, ], name='a') self.adv = tf.placeholder(tf.float32, [None, ], name='advantage') self.gamma = 1. 
self.done = tf.placeholder(tf.float32, [None, ], name='done') self.return_value = tf.placeholder(tf.float32, [None, ], name='return') self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight") self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net") self.predicted_cvr = self.cvr_net[:, 0] self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net") self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net") self.critic = self._build_q_net(self.s, variable_scope="eval_q_net") ae_params = scope_vars(absolute_scope_name("actor_eval_net")) at_params = scope_vars(absolute_scope_name("actor_target_net")) e_gmv_params = scope_vars(absolute_scope_name("eval_q_net")) cvr_params = scope_vars(absolute_scope_name("cvr_net")) with tf.variable_scope('hard_replacement'): self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)]) with tf.variable_scope('loss'): self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr)) self._build_loss() self._pick_loss() with tf.variable_scope('train'): self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params) self._train_ppo_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss) self._train_ppo_actor_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss) def _pick_loss(self): self.has_target_net = True self.critic_loss = self.closs self.actor_loss = self.aloss def _build_loss(self): with tf.variable_scope('critic'): self.c_loss = self.return_value - self.critic self.closs = tf.reduce_mean(tf.square(self.c_loss)) self.advantage = self.return_value - self.critic with tf.variable_scope('surrogate'): a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) pi_prob = tf.gather_nd(params=self.a_eval, indices=a_indices) oldpi_prob = tf.gather_nd(params=self.a_target, indices=a_indices) ratio = pi_prob / (oldpi_prob + 1e-8) surr = ratio * self.adv if self.method == 'kl_pen': kl = tf.distributions.kl_divergence(self.a_target, self.a_eval) self.kl_mean = tf.reduce_mean(kl) self.aloss = -(tf.reduce_mean(surr - self.lam * kl)) else: self.aloss = -tf.reduce_mean(tf.minimum( surr, tf.clip_by_value(ratio, 1. - self.epsilon_clip, 1. 
+ self.epsilon_clip) * self.adv)) def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): cvr_trajectory = other_info["cvr"] for ele in cvr_trajectory: state, cvr = ele self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0) def experience_cmdp(self, new_trajectory, other_info=None): if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) def get_agent_name(self): return self.scope_name def get_action(self, sess, obs, is_test=False, other_info=None): item_price = other_info["proxy_ad_price"] ground_truth_cvr = other_info["cvr"] user_alpha = other_info["user_alpha"] roi_thr = other_info["roi_thr"] observations = obs[np.newaxis, :] cvr = sess.run(self.predicted_cvr, feed_dict={ self.s_cvr: observations })[0] if self.use_predict_cvr: bid = cvr * item_price / roi_thr else: bid = ground_truth_cvr * item_price / roi_thr return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]} def get_cmdp_action(self, sess, obs, is_test=False, other_info=None): if is_test: discrete_action = self.__greedy__(sess, obs) else: discrete_action = self.__epsilon_greedy__(sess, obs) return discrete_action def __greedy__(self, sess, observation): s = observation[np.newaxis, :] prob_weights = sess.run(self.a_eval, feed_dict={self.s: s}) greedy_action = np.argmax(prob_weights, axis=1)[0] return greedy_action def __epsilon_greedy__(self, sess, observation): if np.random.uniform() < self.epsilon: action = np.random.randint(0, self.n_actions) else: action = self.__greedy__(sess, observation) return action def _is_exploration_enough(self, buffer, min_pool_size): return len(buffer) >= min_pool_size def train_cvr(self, sess): if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size): return False, [0, 0, 0] cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size) obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index( sample_indices) _, cvr_loss, predicted_cvrs = sess.run( [self._train_cvr_op, self.cvr_loss, self.predicted_cvr], feed_dict={ self.s_cvr: obs, self.cvr: cvr_targets } ) return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)] def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def update_target(self, sess): if self.epoch % self.replace_target_iter == 0: sess.run(self.a_target_replace_op) def train(self, sess): if self.has_target_net: self.update_target(sess) self.epoch += 1 buffer = self.prioritized_replay_buffer if self.use_prioritized_experience_replay else self.replay_buffer if not self._is_exploration_enough(buffer, self.ppo_batch_size): return False, [0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: loss, montecarlo_loss, q_eval, returns = 
self.train_prioritized(sess) else: loss, montecarlo_loss, q_eval, returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) print("update epsilon:", self.epsilon) return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon def train_prioritized(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.prioritized_replay_buffer.make_index(self.ppo_batch_size) obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval, \ priority_values = sess.run( [self._train_ppo_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a, self.priority_values], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, self.important_sampling_weight_ph: weights, }) priorities = priority_values + 1e-6 self.prioritized_replay_buffer.update_priorities(sample_indices, priorities) return loss, montecarlo_loss, np.average(q_eval), np.average(returns) def train_normal(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.replay_buffer.make_index(self.ppo_batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) adv = sess.run(self.advantage, {self.s: obs, self.return_value: returns}) _, montecarlo_loss, q_eval = sess.run( [self._train_ppo_critic_op, self.critic_loss, self.critic], feed_dict={ self.s: obs, self.a: act, self.adv: adv, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, }) if self.method == 'kl_pen': for _ in range(self.update_step): _, kl, loss = sess.run( [self._train_ppo_actor_op, self.kl_mean, self.actor_loss], feed_dict={ self.adv: adv, self.s: obs, self.a: act, self.r: rew, self.done: done, }) if kl > 4 * self.kl_target: break if kl < self.kl_target / 1.5: self.lam /= 2 elif kl > self.kl_target * 1.5: self.lam *= 2 self.lam = np.clip(self.lam, 1e-4, 10) else: for _ in range(self.update_step): _, loss = sess.run( [self._train_ppo_actor_op, self.actor_loss], feed_dict={ self.adv: adv, self.s: obs, self.a: act, self.r: rew, self.done: done, self.return_value: returns, }) return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
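# Illustration (added, not part of the original code): the `clip` branch of
# _build_loss above is the standard PPO clipped surrogate objective. Below is
# a small, self-contained NumPy restatement of that computation for reference;
# `ppo_clip_loss` is a hypothetical name, and the agent itself uses the
# TensorFlow ops above (epsilon_clip = 0.2 as set in __init__).
import numpy as np

def ppo_clip_loss(pi_prob, oldpi_prob, advantage, epsilon_clip=0.2):
    """Negative clipped-surrogate objective, averaged over the batch."""
    ratio = pi_prob / (oldpi_prob + 1e-8)                 # importance ratio r_t
    clipped = np.clip(ratio, 1. - epsilon_clip, 1. + epsilon_clip)
    return -np.mean(np.minimum(ratio * advantage, clipped * advantage))

# Example: an advantageous action whose probability grew too fast gets clipped.
print(ppo_clip_loss(np.array([0.9]), np.array([0.5]), np.array([1.0])))  # -1.2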
class Agent_ppo(): def __init__(self, render=False): # Create an instance of the network itself, as well as the memory. # Here is also a good place to set environmental parameters, # as well as training parameters - number of episodes / iterations, etc. self.render = render if render: self.env = gym.make('NEL-render-v0') else: self.env = gym.make('NEL-v0') #self.test_env = gym.make('NEL-v0') self.an = self.env.action_space.n # No. of actions in env self.training_time = PARAM.TRAINING_TIME # Training Time self.method = 'PPO' self.test_curr_state = None self.log_time = 100.0 self.test_time = 1000.0 self.burn_in = PARAM.BURN_IN self.tmax = PARAM.A2C_EPISODE_SIZE_MAX self.tmin = PARAM.A2C_EPISODE_SIZE_MIN self.seq_len = PARAM.A2C_SEQUENCE_LENGTH self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE) self.episode_buffer = [[]] * self.tmax self.net = PPO(self.episode_buffer, self.replay_buffer) cur_dir = os.getcwd() self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime( "%Y%m%d-%H%M%S") + '/' # Create output directory if not os.path.exists(self.dump_dir): os.makedirs(self.dump_dir) self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w') self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w') self.curr_state = self.env.reset() self.tong_count = 0 self.curr_state = self.burn_in_memory(self.curr_state) self.train_rewards = [] self.test_rewards = [] self.steps = 0 self.cum_reward = 0.0 self.save_count = 0 def generate_episode(self, tmax, render=False): #for i in range(tmax): ctr, i = (0, 0) self.her_reward_buffer = np.zeros(tmax) her_reward = 0 while ctr < tmax: if i % PARAM.ACTION_REPEAT == 0: val, softmax, action = self.net.get_output( [ctr - 1], seq_len=self.seq_len, batch_size=1) else: action = 0 next_state, reward, _, _ = self.env.step(action) if render: self.env.render() if PARAM.REWARD_SHAPING: psuedo_reward = self.compute_psuedo_reward( next_state['vision']) else: psuedo_reward = 0.0 tong_reward = 0.0 if reward == 0: if self.curr_state['vision'][5, 6, 0] == 1.0: self.tong_count += 1 if PARAM.REWARD_SHAPING: tong_reward = 10.0 elif reward == 100.0: self.tong_count -= 1 her_reward += reward if i % PARAM.ACTION_REPEAT == 0: self.episode_buffer[ctr] = (self.curr_state, action, ((reward + tong_reward) / 100.0 + psuedo_reward), next_state, softmax, self.tong_count, val) self.her_reward_buffer[ctr] = her_reward her_reward = 0 ctr += 1 self.replay_buffer.add(self.curr_state, action, reward / 100.0, next_state, 0, self.tong_count) self.curr_state = next_state i += 1 self.steps += 1 self.cum_reward += reward if self.steps % 100 == 0: self.plot_train_stats() def compute_psuedo_reward(self, vision): avg = np.mean(vision[3:8, 3:8, :], axis=2) idxs = avg == 0.5 avg[idxs] = 0.0 reward = np.sum(avg) - 1.0 / 3.0 if reward < 0.001: return 0.0 return reward def hind_sight_experience_replay(self, episode_len): her_reward = 0 her_decay = PARAM.HER_DECAY for i in range(episode_len - 1, -1, -1): obs, action, reward, next_obs, softmax, tong_count, val = self.episode_buffer[ i] self.episode_buffer[i] = ( obs, action, (self.her_reward_buffer[i] + her_reward * her_decay) / 100.0, next_obs, softmax, tong_count, val) her_reward = her_reward * her_decay + self.her_reward_buffer[i] def train(self): for i in range(self.training_time): self.net.set_train() episode_len = np.random.randint(self.tmin, self.tmax + 1) self.generate_episode(episode_len, self.render) if PARAM.HER: self.hind_sight_experience_replay(episode_len) self.net.train(episode_len) self.save_count += 1 def test(self, 
             testing_steps=100, model_file=None):
        if model_file is not None:
            self.net.load_model(model_file)
        self.net.set_eval()
        cum_reward = 0.0
        # NOTE: relies on a separate evaluation environment (self.test_env),
        # whose creation is commented out in __init__.
        for i in range(testing_steps):
            softmax, action = self.net.get_output(self.curr_state, i)
            _, reward, _, _ = self.test_env.step(action)
            cum_reward += reward
        self.test_rewards.append(cum_reward)
        self.test_file.write(str(self.test_rewards[-1]))
        self.test_file.write('\n')
        self.test_file.flush()
        print('\nTest Reward: %.4f\n' % (self.test_rewards[-1]))

        x = list(range(len(self.test_rewards)))
        plt.plot(x, self.test_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Testing Curve')
        plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
        plt.close()

    def plot_train_stats(self):
        self.cum_reward = self.cum_reward / float(self.log_time)
        self.train_rewards.append(self.cum_reward)
        self.train_file.write(str(self.cum_reward))
        self.train_file.write('\n')
        self.train_file.flush()
        self.cum_reward = 0.0
        if self.train_rewards[-1] > 0:
            self.net.A.save("checkpoint.pth")
        print('[%d] Train Reward: %.4f' %
              (len(self.train_rewards), self.train_rewards[-1]))
        self.steps = 0

        x = list(range(len(self.train_rewards)))
        plt.plot(x, self.train_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Training Curve')
        plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
        plt.close()
        plot(self.dump_dir + self.method, self.train_rewards)
        # if self.save_count > 0 and self.save_count % 500 == 0:
        #     self.net.save_model_weights(self.save_count, self.dump_dir)

    def burn_in_memory(self, curr_state):
        # Initialize your replay memory with a burn_in number of episodes / transitions.
        cnt = 0
        while self.burn_in > cnt:
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)
            if reward == 20.0:
                self.tong_count += 1
            elif reward == 100.0:
                self.tong_count -= 1
            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0, self.tong_count)
            curr_state = next_state
            cnt = cnt + 1
        return curr_state
def __init__(
        self,
        user_num,
        n_actions,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train=1,
):
    PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=2)
    self.user_num = user_num
    self.use_budget_control = use_budget_control
    self.update_times_per_train = update_times_per_train
    self.n_actions = n_actions
    self.n_features = n_features
    self.gamma = 1.
    self.lr = 0.001

    self.user_based_adjust_times = 40
    self.epsilon = 0.4
    self.epsilon_min = 0.05
    self.epsilon_dec = 0.1
    self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
    self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

    self.replace_target_iter = 1
    self.soft_update_iter = 1
    self.softupdate = True

    self.scope_name = "DQN-model"
    self.epoch = 0

    self.buffer_size = 1000 * max_trajectory_length
    self.batch_size = 512
    self.alpha = 0.6
    self.beta = 0.4
    self.use_prioritized_experience_replay = use_prioritized_experience_replay
    if self.use_prioritized_experience_replay:
        self.prioritized_replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=self.alpha, max_priority=20.)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

    self.margin_constant = 2

    with tf.variable_scope(self.scope_name):
        self._build_net()

    self.build_model_saver(self.scope_name)
def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
             copy_step, epsilon_delta, epsilon_start, epsilon_end, load_model,
             path_to_load, path_to_save, plots_to_save, episode_steps,
             episode_to_save, max_buffer_len, model_type):
    super().__init__(gamma=gamma,
                     action_number=action_number,
                     path_to_load=path_to_load,
                     path_to_save=path_to_save,
                     plots_to_save=plots_to_save,
                     load_model=load_model,
                     episode_to_save=episode_to_save,
                     episodes=episodes,
                     model_type=model_type)
    # Epsilon
    self.epsilon_delta = epsilon_delta
    self.epsilon_end = epsilon_end
    self.epsilon_start = epsilon_start
    self.epsilon = epsilon_start
    # Main Params
    self.minibatch = minibatch
    # Episode Params
    self.begin_train = begin_train
    self.copy_step = copy_step
    self.episode_steps = episode_steps
    # Model Fields
    self.action = None
    self.state = None
    self.replay_buffer = ReplayBuffer(max_buffer_len)
    # Model
    self.target_model = model_type(action_number).to(self.device)
    self.update_target()
    # Rewards
    self.rewards_white, self.rewards_black, self.rewards = [], [], []
    self.losses = []
    self.periodic_reward = 0
    self.periodic_rewards = []
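# Note (added): update_target() above is defined in the base class reached via
# super().__init__, which is not shown here. A minimal, self-contained sketch
# of the hard target-network sync it appears to perform -- an assumption, not
# the repository's actual code:
import torch.nn as nn

def hard_update(target_model: nn.Module, online_model: nn.Module) -> None:
    """Copy the online network's parameters into the target network."""
    target_model.load_state_dict(online_model.state_dict())

# e.g. inside the agent: hard_update(self.target_model, self.model)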
class ConstrainedDDPG(CMDPAgent): def init_parameters(self, sess): if self.has_target_net: super(CMDPAgent, self).init_parameters(sess) sess.run(self.target_replace_op) sess.run(self.a_target_replace_op) def __init__(self, user_num, action_dim, action_bound, cvr_n_features, ddpg_n_features, init_roi, budget, use_budget_control, use_prioritized_experience_replay, max_trajectory_length, update_times_per_train=1, use_predict_cvr=False): self.user_num = user_num self.use_budget_control = use_budget_control self.update_times_per_train = update_times_per_train self.action_dim = action_dim self.action_bound = action_bound self.n_actions = 1 self.cvr_n_features = cvr_n_features self.ddpg_n_features = ddpg_n_features self.lr = 0.001 self.use_predict_cvr = use_predict_cvr self.user_based_adjust_times = 40 self.epsilon = 0.9 self.epsilon_min = 0.05 self.epsilon_dec = 0.3 self.epsilon_dec_iter = 5000 // self.user_based_adjust_times self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times self.replace_target_iter = 1 self.soft_update_iter = 1 self.softupdate = True self.scope_name = "CDDPG-model" self.epoch = 0 self.exploration_noise = OUNoise(self.action_dim) self.cvr_buffer_size = 1000 * max_trajectory_length self.cvr_batch_size = 512 self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False) self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay self.ddpg_buffer_size = 1000 * max_trajectory_length self.ddpg_batch_size = 256 if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ddpg_buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.ddpg_buffer_size, save_return=True) with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_cvr_net(self, state, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1', kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense(fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) cvr_out = tf.sigmoid(tf.layers.dense(fc3, units=1, name='cvr', kernel_initializer=initializers.xavier_initializer())) return cvr_out def _build_q_net(self, state, action, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] state = tf.concat([state, tf.expand_dims(action, axis=1, name="2d-action")], 
axis=1) fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1') fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2') q = tf.layers.dense(fc2, units=self.action_dim, name='q') return q[:, 0] def _build_action_net(self, state, variable_scope): with tf.variable_scope(variable_scope): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1') fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2') actions = tf.layers.dense(fc2, self.action_dim, activation=tf.nn.sigmoid, name='a') return actions[:, 0] def __make_update_exp__(self, vals, target_vals): polyak = 1.0 - 1e-2 expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) expression = tf.group(*expression) return expression def _build_net(self): self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr') self.cvr = tf.placeholder(tf.float32, [None, ], name='r') self.s = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s') self.s_ = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s_') self.r = tf.placeholder(tf.float32, [None, ], name='r') self.a = tf.placeholder(tf.float32, [None, ], name='a') self.gamma = 1. self.done = tf.placeholder(tf.float32, [None, ], name='done') self.return_value = tf.placeholder(tf.float32, [None, ], name='return') self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight") self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net") self.predicted_cvr = self.cvr_net[:, 0] self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net") self.a_target = self._build_action_net(self.s_, variable_scope="actor_target_net") self.critic_eval = self._build_q_net(self.s, self.a, variable_scope="eval_q_net") self.critic_eval_for_loss = self._build_q_net(self.s, self.a_eval, variable_scope="eval_q_net", reuse=True) self.critic_target = self._build_q_net(self.s_, self.a, variable_scope="target_q_net") t_gmv_params = scope_vars(absolute_scope_name("target_q_net")) e_gmv_params = scope_vars(absolute_scope_name("eval_q_net")) ae_params = scope_vars(absolute_scope_name("actor_eval_net")) at_params = scope_vars(absolute_scope_name("actor_target_net")) cvr_params = scope_vars(absolute_scope_name("cvr_net")) with tf.variable_scope('hard_replacement'): self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)]) self.target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)]) with tf.variable_scope('soft_update'): self.a_update_target_q = self.__make_update_exp__(ae_params, at_params) self.update_target_q = self.__make_update_exp__(e_gmv_params, t_gmv_params) with tf.variable_scope('q_target'): self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. 
- self.done) * self.critic_target) self.montecarlo_target = self.return_value with tf.variable_scope('loss'): self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr)) self._build_loss() self._pick_loss() with tf.variable_scope('train'): self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params) self._train_ddpg_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=e_gmv_params) self._train_ddpg_a_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params) def _pick_loss(self): self.has_target_net = True self.loss = self.ddpg_loss self.priority_values = self.td0_error self.actor_loss = self.a_loss def _build_loss(self): if self.use_prioritized_experience_replay: self.ddpg_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.td0_q_target, self.critic_eval, name='TD0_loss')) self.montecarlo_loss = tf.reduce_mean(self.important_sampling_weight_ph * tf.squared_difference(self.montecarlo_target, self.critic_eval, name='MonteCarlo_error')) else: self.ddpg_loss = tf.reduce_mean(tf.squared_difference(self.td0_q_target, self.critic_eval, name='TD0_loss')) self.montecarlo_loss = tf.reduce_mean(tf.squared_difference(self.montecarlo_target, self.critic_eval, name='MonteCarlo_error')) self.a_loss = - tf.reduce_mean(self.critic_eval_for_loss) self.td0_error = tf.abs(self.td0_q_target - self.critic_eval) self.montecarlo_error = tf.abs(self.montecarlo_target - self.critic_eval) def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): cvr_trajectory = other_info["cvr"] for ele in cvr_trajectory: state, cvr = ele self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0) def experience_cmdp(self, new_trajectory, other_info=None): if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) def get_agent_name(self): return self.scope_name def get_action(self, sess, obs, is_test=False, other_info=None): item_price = other_info["proxy_ad_price"] ground_truth_cvr = other_info["cvr"] user_alpha = other_info["user_alpha"] roi_thr = other_info["roi_thr"] observations = obs[np.newaxis, :] cvr = sess.run(self.predicted_cvr, feed_dict={ self.s_cvr: observations })[0] if self.use_predict_cvr: bid = cvr * item_price / roi_thr else: bid = ground_truth_cvr * item_price / roi_thr return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]} def get_cmdp_action(self, sess, obs, is_test=False, other_info=None): if is_test: discrete_action = self.__greedy__(sess, obs) else: discrete_action = self.__epsilon_greedy__(sess, obs) return discrete_action def __greedy__(self, sess, observation): observation = observation[np.newaxis, :] greedy_action = sess.run(self.a_eval, feed_dict={self.s: observation}) return greedy_action[0] def __epsilon_greedy__(self, sess, observation): if np.random.uniform() < self.epsilon: observation = observation[np.newaxis, :] 
actions_value = sess.run(self.a_eval, feed_dict={self.s: observation}) action_noise = self.exploration_noise.noise() action = actions_value + action_noise action = action[0] else: action = self.__greedy__(sess, observation) return action def _is_exploration_enough(self, buffer, min_pool_size): return len(buffer) >= min_pool_size def train_cvr(self, sess): if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size): return False, [0, 0, 0] cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size) obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index( sample_indices) _, cvr_loss, predicted_cvrs = sess.run( [self._train_cvr_op, self.cvr_loss, self.predicted_cvr], feed_dict={ self.s_cvr: obs, self.cvr: cvr_targets } ) return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)] def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def update_target(self, sess): if self.softupdate: if self.epoch % self.soft_update_iter == 0: sess.run(self.update_target_q) sess.run(self.a_update_target_q) else: if self.epoch % self.replace_target_iter == 0: sess.run(self.target_replace_op) sess.run(self.a_target_replace_op) def train(self, sess): if self.has_target_net: self.update_target(sess) self.epoch += 1 buffer = self.prioritized_replay_buffer if self.use_prioritized_experience_replay else self.replay_buffer if not self._is_exploration_enough(buffer, self.ddpg_batch_size): return False, [0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: loss, montecarlo_loss, q_eval, returns = self.train_prioritized(sess) else: loss, montecarlo_loss, q_eval, returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) print("update epsilon:", self.epsilon) return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon def train_prioritized(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.prioritized_replay_buffer.make_index(self.ddpg_batch_size) obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval, \ priority_values = sess.run( [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval, self.priority_values], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, self.important_sampling_weight_ph: weights, }) priorities = priority_values + 1e-6 self.prioritized_replay_buffer.update_priorities(sample_indices, priorities) return loss, montecarlo_loss, np.average(q_eval), np.average(returns) def train_normal(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.replay_buffer.make_index(self.ddpg_batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval = sess.run( [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: 
returns, }) _, actor_loss = sess.run( [self._train_ddpg_a_op, self.actor_loss], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, }) return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
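# Note (added): OUNoise is imported from elsewhere in the repository; the DDPG
# agents above only use OUNoise(action_dim) and .noise(). Below is a minimal
# Ornstein-Uhlenbeck process consistent with that interface -- an assumption
# about the helper, not its actual implementation.
import numpy as np

class OUNoiseSketch:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift back toward mu plus diffusion.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state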
def __init__(
        self,
        user_num,
        action_dim,
        action_bound,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train,
):
    PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=2)
    self.use_budget_control = use_budget_control
    self.user_num = user_num
    self.action_bound = action_bound
    self.action_dim = action_dim
    self.n_actions = 1
    self.n_features = n_features
    self.gamma = 1.
    self.update_times_per_train = update_times_per_train
    self.lr = 0.001

    self.epsilon = 0.9
    self.epsilon_min = 0.1
    self.epsilon_dec = 0.3
    self.epsilon_dec_iter = 100

    self.replace_target_iter = 300
    self.soft_update_iter = 1
    self.softupdate = True

    self.scope_name = "DDPG-model"
    self.epoch = 0

    self.exploration_noise = OUNoise(self.action_dim)
    self.noise_weight = 1
    self.noise_descrement_per_sampling = 0.0001

    self.buffer_size = 20000 * max_trajectory_length
    self.batch_size = 512
    self.alpha = 0.6
    self.beta = 0.4
    self.use_prioritized_experience_replay = use_prioritized_experience_replay
    if self.use_prioritized_experience_replay:
        self.prioritized_replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=self.alpha, max_priority=20.)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

    with tf.variable_scope(self.scope_name):
        self._build_net()

    self.build_model_saver(self.scope_name)
class ContextualBandit(PIDAgent, CvrAgent): def __init__( self, user_num, n_features, init_roi, budget, use_budget_control, max_trajectory_length, update_times_per_train=1, ): PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=1) self.user_num = user_num self.use_budget_control = use_budget_control self.update_times_per_train = update_times_per_train self.n_actions = 1 self.n_features = n_features self.lr = 0.001 self.scope_name = "MyopicGreedy-model" self.epoch = 0 self.buffer_size = 1000 * max_trajectory_length self.batch_size = 512 self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=False) with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_cvr_net(self, state, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1') cvr_out = tf.sigmoid(tf.layers.dense(fc1, units=1, name='cvr')) return cvr_out def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') self.cvr = tf.placeholder(tf.float32, [ None, ], name='r') self.cvr_net = self._build_cvr_net(self.s, variable_scope="cvr_net") self.predicted_cvr = self.cvr_net[:, 0] cvr_params = scope_vars(absolute_scope_name("cvr_net")) with tf.variable_scope('loss'): self.cvr_loss = tf.reduce_mean( tf.squared_difference(self.predicted_cvr, self.cvr)) with tf.variable_scope('train'): self._train_op = tf.train.AdamOptimizer(self.lr).minimize( self.cvr_loss, var_list=cvr_params) def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): cvr_trajectory = other_info["cvr"] for ele in cvr_trajectory: state, cvr = ele self.replay_buffer.add(state, 0, cvr, state, 0, 0, 0) def get_action(self, sess, obs, is_test=False, other_info=None): item_price = other_info["proxy_ad_price"] ground_truth_cvr = other_info["cvr"] user_alpha = other_info["user_alpha"] if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi observations = obs[np.newaxis, :] cvr = sess.run(self.predicted_cvr, feed_dict={self.s: observations})[0] bid = ground_truth_cvr * item_price / roi_thr return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]} def _is_exploration_enough(self, min_pool_size): return len(self.replay_buffer) >= min_pool_size def train(self, sess): self.epoch += 1 if not self._is_exploration_enough(self.batch_size): return False, [0, 0, 0] cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = 
self.replay_buffer.make_index(self.batch_size) obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) _, cvr_loss, predicted_cvrs = sess.run( [self._train_op, self.cvr_loss, self.predicted_cvr], feed_dict={ self.s: obs, self.cvr: cvr_targets }) return True, [ cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets) ]
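# Note (added): ReplayBuffer / PrioritizedReplayBuffer are imported from
# elsewhere in the repository. The agents above rely only on the interface
# sketched below (add / make_index / sample_index / len / current_mean_return);
# this is an assumption about that interface, not the actual implementation.
import numpy as np

class ReplayBufferSketch:
    def __init__(self, size, save_return=False):
        self.size = size
        self.save_return = save_return
        self.storage = []

    def __len__(self):
        return len(self.storage)

    def add(self, obs, action, reward, obs_next, done, dis_2_end, ret):
        # Drop the oldest transition once capacity is reached.
        if len(self.storage) >= self.size:
            self.storage.pop(0)
        self.storage.append((obs, action, reward, obs_next, done, dis_2_end, ret))

    def make_index(self, batch_size):
        return np.random.randint(0, len(self.storage), size=batch_size)

    def sample_index(self, indices):
        # Returns obs, act, rew, obs_next, done, dis_2_end, return as arrays.
        cols = list(zip(*[self.storage[i] for i in indices]))
        return [np.asarray(c) for c in cols]

    @property
    def current_mean_return(self):
        return float(np.mean([t[-1] for t in self.storage])) if self.storage else 0.0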
class Agent(): def __init__(self, render=False, method='Duel'): # Create an instance of the network itself, as well as the memory. # Here is also a good place to set environmental parameters, # as well as training parameters - number of episodes / iterations, etc. self.render = render if render: self.env = gym.make('NEL-render-v0') else: self.env = gym.make('NEL-v0') #self.test_env = gym.make('NEL-v0') self.an = self.env.action_space.n # No. of actions in env self.epsilon = 0.5 self.training_time = PARAM.TRAINING_TIME # Training Time self.df = PARAM.DISCOUNT_FACTOR # Discount Factor self.batch_size = PARAM.BATCH_SIZE self.method = method self.test_curr_state = None self.log_time = 100.0 self.test_time = 1000.0 self.prioritized_replay = PARAM.PRIORITIZED_REPLAY self.prioritized_replay_eps = 1e-6 #self.prioritized_replay_alpha = 0.6 self.prioritized_replay_alpha = 0.8 self.prioritized_replay_beta0 = 0.4 self.burn_in = PARAM.BURN_IN # Create Replay Memory and initialize with burn_in transitions if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha) self.beta_schedule = LinearSchedule( float(self.training_time), initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE) self.beta_schedule = None # Create QNetwork instance if self.method == 'Duel': print('Using Duel Network.') self.net = DuelQNetwork(self.an) elif self.method == 'DoubleQ': print('Using DoubleQ Network.') self.net = DoubleQNetwork(self.an) else: raise NotImplementedError cur_dir = os.getcwd() self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime( "%Y%m%d-%H%M%S") + '/' # Create output directory if not os.path.exists(self.dump_dir): os.makedirs(self.dump_dir) self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w') self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w') def update_epsilon(self): ''' Epsilon decay from 0.5 to 0.05 over 100000 iterations. ''' if self.epsilon <= 0.05: self.epsilon = 0.05 return self.epsilon = self.epsilon - (0.5 - 0.1) / 200000.0 def epsilon_greedy_policy(self, q_values, epsilon): # Creating epsilon greedy probabilities to sample from. val = np.random.rand(1) if val <= epsilon: return np.random.randint(q_values.shape[1]) return np.argmax(q_values) def greedy_policy(self, q_values): # Creating greedy policy for test time. 
return np.argmax(q_values) def train(self): train_rewards = [] test_rewards = [] count = 0 steps = 0 test_steps = 0 cum_reward = 0.0 elapsed = 0.0 curr_state = self.env.reset() curr_state = self.burn_in_memory(curr_state) prev_action = -1 if self.render: self.env.render() for i in range(self.training_time): # Get q_values based on the current state Vt, St = self.get_input_tensor(curr_state) q_values = self.net.get_Q_output(Vt, St) # Selecting an action based on the policy action = self.epsilon_greedy_policy(q_values, self.epsilon) #if not curr_state['moved'] and action == prev_action and self.epsilon > 0.1: # action = self.epsilon_greedy_policy(q_values, 0.5) # Executing action in simulator nextstate, reward, _, _ = self.env.step(action) steps = steps + 1 test_steps = test_steps + 1 if self.render: self.env.render() # Store Transition if nextstate['moved'] or prev_action != action: self.replay_buffer.add(curr_state, action, reward / 100.0, nextstate, 0) prev_action = action # Sample random minibatch from experience replay if self.prioritized_replay: batch, weights, batch_idxes = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(i)) else: batch = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones(self.batch_size), None # Train the Network with mini batches xVT, xST = self.get_input_tensors(batch) yT = self.get_output_tensors(batch) # Mask to select the actions from the Q network output mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8) for k, tran in enumerate(batch): mT[k, tran[1]] = 1 td_errors = self.net.train(xVT, xST, yT, mT, weights) if self.prioritized_replay: #new_priorities = np.abs(td_errors) + self.prioritized_replay_eps #new_priorities = [] #for i, tran in enumerate(batch): # new_priorities.append(tran[2] + self.prioritized_replay_eps) self.replay_buffer.update_priorities(batch_idxes, weights) # Decay epsilon self.update_epsilon() cum_reward += reward curr_state = nextstate if steps == 100: cum_reward = cum_reward / float(self.log_time) train_rewards.append(cum_reward) self.train_file.write(str(cum_reward)) self.train_file.write('\n') self.train_file.flush() cum_reward = 0.0 print('Train Reward: %.4f' % (train_rewards[-1])) steps = 0 x = list(range(len(train_rewards))) plt.plot(x, train_rewards, '-bo') plt.xlabel('Time') plt.ylabel('Average Reward') plt.title('Training Curve') plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png') plt.close() plot(self.dump_dir + self.method, train_rewards) # if test_steps == 500: # self.net.set_eval() # test_rewards.append(self.test()) # self.test_file.write(str(test_rewards[-1])) # self.test_file.write('\n') # self.test_file.flush() # self.net.set_train() # count = count + 1 # print('\nTest Reward: %.4f\n' % (test_rewards[-1])) # test_steps = 0 # # x = list(range(len(test_rewards))) # plt.plot(x, test_rewards, '-bo') # plt.xlabel('Time') # plt.ylabel('Average Reward') # plt.title('Testing Curve') # plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png') # plt.close() if count > 0 and count % 30 == 0: self.net.save_model_weights(count, self.dump_dir) def test(self, testing_steps=100, model_file=None, capture=False): if model_file is not None: self.net.load_model(model_file) if capture: self.test_env = gym.wrappers.Monitor(self.test_env, './') epsilon = 0.05 rewards = [] self.test_curr_state = self.test_env.reset() #if self.render: # self.test_env.render() cum_reward = 0.0 for i in range(testing_steps): # Initializing the episodes Vt, St = 
self.get_input_tensor(self.test_curr_state) q_values = self.net.get_Q_output(Vt, St) action = self.epsilon_greedy_policy(q_values, epsilon) # Executing action in simulator nextstate, reward, _, _ = self.test_env.step(action) #if self.render: # self.test_env.render() cum_reward += reward self.test_curr_state = nextstate avg_reward = cum_reward / float(testing_steps) rewards.append(avg_reward) return avg_reward def burn_in_memory(self, curr_state): # Initialize your replay memory with a burn_in number of episodes / transitions. cnt = 0 while self.burn_in > cnt: # Randomly selecting action for burn in. Not sure if this is correct. action = self.env.action_space.sample() next_state, reward, _, _ = self.env.step(action) self.replay_buffer.add(curr_state, action, reward / 100.0, next_state, 0) curr_state = next_state cnt = cnt + 1 return curr_state def get_input_tensor(self, obs): ''' Returns an input tensor from the observation. ''' iV = np.zeros((1, 3, 11, 11)) iS = np.zeros((1, 4)) iV[0] = np.moveaxis(obs['vision'], -1, 0) iS[0] = np.concatenate((obs['scent'], np.array([int(obs['moved'])])), axis=0) iVt, iSt = torch.from_numpy(iV).float(), torch.from_numpy(iS).float() return iVt, iSt def get_input_tensors(self, batch, next_state=False): ''' Returns an input tensor created from the sampled batch. ''' V = np.zeros((self.batch_size, 3, 11, 11)) S = np.zeros((self.batch_size, 4)) for i, tran in enumerate(batch): if next_state: obs = tran[3] # next state else: obs = tran[0] # current state V[i] = np.moveaxis(obs['vision'], -1, 0) S[i] = np.concatenate( (obs['scent'], np.array([int(obs['moved'])])), axis=0) Vt, St = torch.from_numpy(V).float(), torch.from_numpy(S).float() return Vt, St def get_output_tensors(self, batch): ''' Returns an output tensor created from the sampled batch. ''' Y = np.zeros(self.batch_size) Vt, St = self.get_input_tensors(batch, next_state=True) q_values_a = self.net.get_Q_output(Vt, St) q_values_e = self.net.get_target_output(Vt, St) for i, tran in enumerate(batch): action = self.greedy_policy(q_values_a[i]) Y[i] = tran[2] + self.df * q_values_e[i][action] Yt = torch.from_numpy(Y).float() return Yt
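# A minimal, self-contained sketch (not part of the original agent) of the observation
# encoding that Agent.get_input_tensor / get_input_tensors implement above: the NEL
# observation dict is split into a CHW vision tensor (3 x 11 x 11) and a 4-dim scalar
# vector made of the 3-dim scent plus the binary `moved` flag. The dummy observation
# below is fabricated purely for illustration.
import numpy as np
import torch

def encode_observation(obs):
    vision = np.moveaxis(obs['vision'], -1, 0)                       # HWC -> CHW
    scalars = np.concatenate((obs['scent'], [float(obs['moved'])]))  # scent + moved flag
    return (torch.from_numpy(vision[None]).float(),
            torch.from_numpy(scalars[None]).float())

# dummy_obs = {'vision': np.zeros((11, 11, 3)), 'scent': np.zeros(3), 'moved': True}
# Vt, St = encode_observation(dummy_obs)  # shapes: (1, 3, 11, 11) and (1, 4)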
class PPO_interface(LearningAgent, PIDAgent): def __init__(self, user_num, action_dim, n_features, init_roi, budget, use_budget_control, use_prioritized_experience_replay, max_trajectory_length, update_times_per_train ): PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=1) self.user_num = user_num self.use_budget_control = use_budget_control self.action_dim = action_dim self.n_actions = 11 self.n_features = n_features self.lr = 0.001 self.update_times_per_train = update_times_per_train self.epsilon = 0.5 self.epsilon_min = 0.01 self.epsilon_dec = 0.2 self.epsilon_dec_iter = 100 self.epsilon_clip = 0.2 self.replace_target_iter = 1 self.soft_update_iter = 1 self.softupdate = False self.scope_name = "PPO-model" self.epoch = 0 self.lam = 0.5 self.update_step = 1 self.kl_target = 0.01 self.gamma = 1. self.method = 'clip' self.policy_logvar = 1e-7 self.decay_rate = 0.9 self.decay_steps = 5000 self.global_ = tf.Variable(tf.constant(0)) self.buffer_size = 1000 * max_trajectory_length self.batch_size = 500 self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') self.r_gmv = tf.placeholder(tf.float32, [None, ], name='r_gmv') self.r_cost = tf.placeholder(tf.float32, [None, ], name='r_cost') self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr") self.r = tf.placeholder(tf.float32, [None, ], name='r') self.a = tf.placeholder(tf.int32, [None, ], name='a') self.adv = tf.placeholder(tf.float32, [None, ], name='advantage') self.done = tf.placeholder(tf.float32, [None, ], name='done') self.gmv_return_value = tf.placeholder(tf.float32, [None, ], name='gmv_return') self.cost_return_value = tf.placeholder(tf.float32, [None, ], name='cost_return') self.return_value = tf.placeholder(tf.float32, [None, ], name='return') self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight") self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net") self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net") self.critic_gmv = self._build_q_net(self.s, variable_scope="critic_eval_gmv_net") self.critic_cost = self._build_q_net(self.s, variable_scope="critic_eval_cost_net") self.critic = self.critic_gmv - self.roi_thr * self.critic_cost ae_params = scope_vars(absolute_scope_name("actor_eval_net")) at_params = scope_vars(absolute_scope_name("actor_target_net")) print(ae_params) print(at_params) with tf.variable_scope('hard_replacement'): self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)]) self._build_loss() self._pick_loss() with tf.variable_scope('train'): self.gmv_ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.gmv_loss) self.cost_ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.cost_loss) self.ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss) self.atrain_op = 
tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss) with tf.variable_scope('roi'): self.max_longterm_roi = self.critic_gmv / (self.critic_cost + 1e-4) def _pick_loss(self): self.has_target_net = True self.critic_loss = self.closs self.gmv_loss = self.gmv_closs self.cost_loss = self.cost_closs self.actor_loss = self.aloss def _build_loss(self): with tf.variable_scope('critic'): self.gmv_c_loss = self.gmv_return_value - self.critic_gmv self.cost_c_loss = self.cost_return_value - self.critic_cost self.c_loss = self.return_value - self.critic self.gmv_closs = tf.reduce_mean(tf.square(self.gmv_c_loss)) self.cost_closs = tf.reduce_mean(tf.square(self.cost_c_loss)) self.closs = tf.reduce_mean(tf.square(self.c_loss)) self.advantage = self.return_value - self.critic with tf.variable_scope('surrogate'): a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) pi_prob = tf.gather_nd(params=self.a_eval, indices=a_indices) oldpi_prob = tf.gather_nd(params=self.a_target, indices=a_indices) ratio = pi_prob / (oldpi_prob + 1e-8) surr = ratio * self.adv if self.method == 'kl_pen': kl = tf.distributions.kl_divergence(self.a_target, self.a_eval) self.kl_mean = tf.reduce_mean(kl) self.aloss = -(tf.reduce_mean(surr - self.lam * kl)) else: self.aloss = -tf.reduce_mean(tf.minimum( surr, tf.clip_by_value(ratio, 1. - self.epsilon_clip, 1. + self.epsilon_clip) * self.adv)) def update_target(self, sess): if self.epoch % self.replace_target_iter == 0: sess.run(self.a_target_replace_op) def train(self, sess): if self.has_target_net: self.update_target(sess) self.epoch += 1 if not self._is_exploration_enough(self.batch_size): return False, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_prioritized(sess) else: policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) print("update epsilon:", self.epsilon) return True, [policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns], self.get_memory_returns(), self.epsilon def _build_action_net(self, state, variable_scope): with tf.variable_scope(variable_scope): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] l1 = tf.layers.dense(state, n_features // 2, tf.nn.relu) l2 = tf.layers.dense(l1, n_features // 4, tf.nn.relu) a_prob = tf.layers.dense(l2, self.n_actions, tf.nn.softmax) return a_prob def _build_q_net(self, state, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, 
dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] l1 = tf.layers.dense(state, n_features // 2, tf.nn.relu) l2 = tf.layers.dense(l1, n_features // 4, tf.nn.relu) v = tf.layers.dense(l2, 1) return v[:, 0] def train_normal(self, sess): policy_loss, policy_entropy = 0, 0 loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0 cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0 if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi for idx in range(self.update_times_per_train): sample_indices = self.replay_buffer.make_latest_index(self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index( sample_indices) obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index( sample_indices) adv = sess.run(self.advantage, {self.s: obs, self.return_value: returns, self.roi_thr: roi_thr}) ret = sess.run(self.return_value, {self.s: obs, self.return_value: returns, self.roi_thr: roi_thr}) criti = sess.run(self.critic_cost, {self.s: obs, self.return_value: returns, self.roi_thr: roi_thr}) [sess.run([self.ctrain_op, self.gmv_ctrain_op, self.cost_ctrain_op], feed_dict={ self.adv: adv, self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.done: done, self.gmv_return_value: gmv_returns, self.cost_return_value: cost_returns, self.return_value: returns, self.roi_thr: roi_thr}) for _ in range(self.update_step)] if self.method == 'kl_pen': for _ in range(self.update_step): _, kl, loss, gmv_eval, cost_eval = sess.run( [self.atrain_op, self.kl_mean, self.closs, self.critic_gmv, self.critic_cost], feed_dict={ self.adv: adv, self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.done: done, self.gmv_return_value: gmv_returns, self.cost_return_value: cost_returns, self.return_value: returns, self.roi_thr: roi_thr}) if kl > 4 * self.kl_target: break if kl < self.kl_target / 1.5: self.lam /= 2 elif kl > self.kl_target * 1.5: self.lam *= 2 self.lam = np.clip(self.lam, 1e-4, 10) else: for _ in range(self.update_step): _, loss, q_eval, gmv_loss, gmv_q_eval, cost_loss, cost_q_eval \ = sess.run( [self.atrain_op, self.closs, self.critic, self.gmv_loss, self.critic_gmv, self.cost_loss, self.critic_cost], feed_dict={ self.adv: adv, self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.done: done, self.gmv_return_value: gmv_returns, self.cost_return_value: cost_returns, self.return_value: returns, self.roi_thr: roi_thr }) return policy_loss, policy_entropy, loss, montecarlo_loss, np.average(q_eval), np.average(returns), \ gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \ cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns) def __make_hardreplace_exp__(self, vals, target_vals): expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(var)) expression = tf.group(*expression) return expression def build_model_saver(self, var_scope): var_list = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=3) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): new_trajectory_gmv = other_info["gmv"] new_trajectory_cost = other_info["cost"] if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) add_episode(self.gmv_replay_buffer, new_trajectory_gmv, gamma=self.gamma) add_episode(self.cost_replay_buffer, new_trajectory_cost, gamma=self.gamma) def __epsilon_greedy__(self, sess, observation, roi_thr): if np.random.uniform() < self.epsilon: s = observation[np.newaxis, :] prob_weights = sess.run(self.a_eval, feed_dict={self.s: s}) a = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) bid = a else: bid = self.__greedy__(sess, observation, roi_thr) return bid def __greedy__(self, sess, observation, roi_thr): s = observation[np.newaxis, :] prob_weights = sess.run(self.a_eval, feed_dict={self.s: s}) a = np.argmax(prob_weights, axis=1)[0] bid = a return bid def choose_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi return self.__epsilon_greedy__(sess, observation, roi_thr) def greedy_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi bid = self.__greedy__(sess, observation, roi_thr) if self.use_budget_control: user_idx = other_info["user_idx"] request_idx = other_info["request_idx"] roi_threshold = self.get_roi_threshold() if request_idx == 0: observations = observation[np.newaxis, :] max_plongterm_roi = sess.run( self.max_longterm_roi, feed_dict={ self.s: observations, self.a: [bid] } ) if max_plongterm_roi >= roi_threshold: self.explore_user(user_idx) return bid else: return 0. else: if self.is_user_selected(user_idx): return bid else: return 0 else: return bid def get_action(self, sess, obs, is_test=False, other_info=None): if is_test: discrete_action = self.greedy_action(sess, obs, other_info) else: discrete_action = self.choose_action(sess, obs, other_info) bid_max = MultiUserEnv.bid_max bid_min = MultiUserEnv.bid_min other_action_info = { "learning_action": discrete_action } return bid_min + (bid_max - bid_min) / (self.n_actions - 1) * discrete_action, other_action_info def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def _is_exploration_enough(self, min_pool_size): if self.use_prioritized_experience_replay: return len(self.prioritized_replay_buffer) >= min_pool_size else: return len(self.replay_buffer) >= min_pool_size
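# A standalone numpy sketch of the clipped surrogate objective that
# PPO_interface._build_loss constructs in its `method == 'clip'` branch: with
# ratio = pi(a|s) / pi_old(a|s), the actor loss is the negative mean of
# min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A). The arrays below are illustrative;
# only the formula mirrors the graph above (eps_clip plays the role of self.epsilon_clip).
import numpy as np

def ppo_clip_loss(pi_prob, oldpi_prob, adv, eps_clip=0.2):
    ratio = pi_prob / (oldpi_prob + 1e-8)
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * adv
    # the min keeps the more pessimistic of the clipped / unclipped terms
    return -np.mean(np.minimum(unclipped, clipped))

# Example with made-up numbers:
# print(ppo_clip_loss(np.array([0.45, 0.10]), np.array([0.30, 0.20]),
#                     adv=np.array([1.0, -0.5])))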
def main(args): '''Initialize replay buffer, models, and environment.''' if args.logging > 0: import time replay_state_dim = 12 if args.door == 1 or args.door == 3: replay_state_dim += 1 elif args.door == 5: replay_state_dim += 1 + 3 * 2 elif args.drawer: replay_state_dim += 3 * 3 + 1 if not args.robot: replay_buffer = ReplayBuffer( max_replay_buffer_size = args.replay_buffer_size, trajectory_length=args.traj_length, state_dim=replay_state_dim, action_dim=args.action_dim, savedir=args.log_dir, ) img_buffer = ImageBuffer( # 3x64x64 pixels trajectory_length=args.num_traj_per_epoch*args.traj_length, action_dim=args.action_dim, savedir=args.log_dir, memory_size=500, ) if args.logging == 0: # no env logging args.verbose = False if args.robot: import gym import franka_env env = gym.make("Franka-v0") else: env = Tabletop( log_freq=args.env_log_freq, filepath=args.log_dir + '/env', door=args.door, drawer=args.drawer, hard=args.hard, verbose=args.verbose) if args.logging == 2: viz_env = Tabletop( door=args.door, drawer=args.drawer, hard=args.hard, log_freq=args.env_log_freq, filepath=None, verbose=False) else: viz_env = None ''' Initialize models ''' enc_dec = SimpleVAE(device, args.latent_dim, args.log_dir) if args.reload is not None: enc_dec.load_state_dict(torch.load(args.reload + '/enc_dec/{}model.bin'.format(args.reload_epoch))) enc_dec.to(device) enc_params = list(enc_dec.params) enc_optimizer = optim.Adam(enc_params, lr=1e-3) # just for enc_dec dynamics_models = None if args.dynamics_var: dynamics_models = [] dyn_params = None for a in range(5): dynamics_model = TransitionModel(args.latent_dim, args.action_dim, args.log_dir, num=a) dynamics_model.to(device) dynamics_models.append(dynamics_model) if a == 0: dyn_params = list(dynamics_model.params) else: dyn_params += list(dynamics_model.params) else: dynamics_model = TransitionModel(args.latent_dim, args.action_dim, args.log_dir, recurrent=False) if args.reload is not None: dynamics_model.load_state_dict(torch.load(args.reload + '/dynamics_model/{}model.bin'.format(args.reload_epoch))) dynamics_model.to(device) dyn_params = list(dynamics_model.params) dyn_optimizer = optim.Adam(dyn_params, lr=1e-3) # just for transition model # If using Classifiers classifiers = None if args.use_classifiers is not None: classifiers = [] for i in range(args.num_classifiers): # classifier = BinClassifier(args.latent_dim, args.log_dir + '/classifier', i) if args.instance_normalized: classifier = BinClassifier_InsNorm(args.latent_dim, args.log_dir + '/classifier', i) else: classifier = BinClassifier(args.latent_dim, args.log_dir + '/classifier', i) if args.reload is not None: classifier.load_state_dict(torch.load(args.reload + '/classifier/{}/{}model.bin'.format(i, args.reload_epoch))) classifier.to(device) classifiers.append(classifier) if i == 0: c_params = list(classifier.params) else: c_params += list(classifier.params) c_optimizer = optim.Adam(c_params, lr=1e-3) # If using SMM density_vae = None goal_vae = None if args.smm: density_vae = VAEGoal(args.log_dir + '/density_model') goal_vae = VAEGoal(args.log_dir + '/goal') density_vae.to(device) goal_vae.to(device) d_params = list(density_vae.params) g_params = list(goal_vae.params) g_optimizer = optim.Adam(d_params, lr=1e-3) d_optimizer = optim.Adam(g_params, lr=1e-3) ''' Return goals ''' goals = np.array(get_goal_imgs(args, env, filepath=args.log_dir + '/goal_ims')) goals = goals / 255. 
goals = torch.tensor(goals).float().to(device) goals = goals.permute(0, 3, 1, 2) # If flag 0, no training losses log if args.logging > 0: hist = Hist(args.log_dir) env.max_path_length = args.traj_length * args.num_traj_per_epoch # Clean the env memory to make sure above code isn't affecting the env if not args.robot: env.initialize() ob, env_info = None, None for epoch in gt.timed_for(range(args.num_epochs), save_itrs=True): if args.logging > 0: start = time.time() init_low_dim = None obs_sample = [] high_dim_sample = [] ob, env_info = env.reset_model(add_noise=args.add_noise) if epoch == 0 and args.logging > 0 and not args.robot: init_array = env.get_obs() * 255. init_img = Image.fromarray(init_array.astype(np.uint8)) init_img.save(args.log_dir + '/init.png') ''' Log low dim state for plotting block interaction bars ''' if not args.robot: init_low_dim = get_obs(args, env_info) obs_sample.append(init_low_dim) init_ob = ob eps_obs = [] eps_next = [] eps_act = [] for i in range(args.num_traj_per_epoch): ob = torch.tensor(ob).unsqueeze(0).permute(0, 3, 1, 2).float().to(device) if i == 0: high_dim_sample.append(ptu.get_numpy(ob.squeeze(0))) if epoch < 100: actions = get_random_action_sequence(env, args.traj_length, sample_sz = 1) actions = ptu.get_numpy(actions).squeeze(0) else: sorted_rewards, sorted_actions, sorted_preds = plan_actions(args, env, ob, dynamics_model, enc_dec, classifiers=classifiers, goal_vae=goal_vae, density_vae=density_vae, dynamics_models=dynamics_models) # Randomly select from the top K with highest reward act = np.random.choice(TOP_K) actions = sorted_actions[act] ''' Log best and worst 3 trajectories (gifs and final state imgs) ''' if args.logging > 0 and epoch % args.model_log_freq == 0: log_rankings(args, enc_dec, hist.rankings_dir, viz_env, init_low_dim, sorted_actions, sorted_preds, epoch, i, sorted_rewards) action_sample = [] for action in actions: # With low probability take a random action rand = np.random.uniform(0.0, 1.0) if rand < args.random_act_prob: action = get_random_action_sequence(env, 1, sample_sz = 1).cpu().detach().numpy() action = action.reshape(args.action_dim) next_ob, reward, terminal, env_info = env.step(action) ob = next_ob next_ob = torch.tensor(next_ob).permute(2, 0, 1).float().to(device).unsqueeze(0) # change to 3 x 64 x 64 obs high_dim_sample.append(ptu.get_numpy(next_ob.squeeze(0))) if not args.robot: obs = get_obs(args, env_info) obs_sample.append(obs) init_low_dim = obs_sample[-1].copy() action_sample.append(action) if not args.robot: replay_buffer.add_sample( states=obs_sample[:-1], next_states=obs_sample[1:], actions=action_sample, ) last_obs = obs_sample[-1] eps_obs.append(high_dim_sample[:-1]) eps_next.append(high_dim_sample[1:]) eps_act.append(action_sample) last_frame = high_dim_sample[-1] obs_sample = [] high_dim_sample = [] # This becomes the init frame of the next traj if not args.robot: obs_sample.append(last_obs) high_dim_sample.append(last_frame) # reshape to -1, EPS SZ 50, 3, 64, 64 eps_obs = np.array(eps_obs).reshape(-1, args.num_traj_per_epoch * args.traj_length, 3, 64, 64) if epoch == 1: with imageio.get_writer(args.log_dir + '/trial.gif', mode='I') as writer: for k, frame in enumerate(eps_obs[0]): img = np.array(frame) img = img.transpose((1, 2, 0)) * 255.0 writer.append_data(img.astype('uint8')) eps_next = np.array(eps_next).reshape(-1, args.num_traj_per_epoch * args.traj_length, 3, 64, 64) eps_act = np.array(eps_act).reshape(-1, args.num_traj_per_epoch * args.traj_length, img_buffer.action_dim) img_buffer.add_sample( 
states=eps_obs, next_states=eps_next, actions=eps_act, ) # Gradually increase the horizon for training the dynamics model predlen = 10 if epoch < 300: predlen = 8 if epoch < 150: predlen = 4 if epoch < 50: predlen = 2 if epoch % args.update_freq == 0: print("Updating") if args.logging > 0 and epoch % args.loss_log_freq == 0: epoch_dynamics_loss = np.zeros((args.grad_steps_per_update,), dtype=float) epoch_vae_loss = np.zeros((args.grad_steps_per_update,), dtype=float) if args.use_classifiers is not None: epoch_auxillary_loss = np.zeros((args.classifiers_grad_steps,), dtype=float) else: epoch_auxillary_loss = np.zeros((args.grad_steps_per_update,), dtype=float) for grstep in range(args.grad_steps_per_update): losses = [] # Return [batch_sz, predlen, 3, 64, 64] obs, next_obs, actions, success = img_buffer.draw_samples(batch_size=args.batch_sz, length=predlen) obs = torch.tensor(obs).float().to(device) next_obs = torch.tensor(next_obs).float().to(device) actions = torch.tensor(actions).float().to(device) _, _, _, _, obs_z = enc_dec.forward(obs) _, _, _, _, next_z = enc_dec.forward(next_obs) g_ind = np.random.randint(0, len(goals), args.batch_sz) g_samples = goals[g_ind] _, _, _, _, goal_z = enc_dec.forward(g_samples.unsqueeze(1)) if args.dynamics_var: ''' Train dynamics models in disagreement ensemble ''' dynamics_loss = train_disgrmt_ensemble(img_buffer, dynamics_models, enc_dec, dyn_optimizer, args.batch_sz, predlen) auxillary_loss = dynamics_loss else: dynamics_loss, pred_z = train_dynamics_model(dynamics_model, obs_z, next_z, actions, dyn_optimizer) if args.logging > 0 and epoch % args.model_log_freq == 0 and not args.dynamics_var: dynamics_preds = enc_dec.dec(pred_z.float()) # decode pred_z through the decoder & compare with next_obs for num in range(3): dynamics_pred = dynamics_preds[num] dynamics_true = next_obs[num] dynamics_pred = ptu.get_numpy(dynamics_pred.permute(0, 2, 3, 1)) dynamics_true = ptu.get_numpy(dynamics_true.permute(0, 2, 3, 1)) dynamics_true = (dynamics_true * 255.).astype(np.uint8) dynamics_pred = (dynamics_pred * 255.).astype(np.uint8) path = args.log_dir + '/dynamics_preds/' + str(epoch) if not os.path.exists(path): os.makedirs(path) with imageio.get_writer(path + '/train_true' + str(num) + '.gif', mode='I') as writer: for e in range(len(dynamics_true)): writer.append_data(dynamics_true[e]) with imageio.get_writer(path + '/train_pred' + str(num) + '.gif', mode='I') as writer: for e in range(len(dynamics_pred)): writer.append_data(dynamics_pred[e]) ''' Train classifiers ''' if args.use_classifiers is not None and grstep < args.classifiers_grad_steps: score_path = None if args.logging > 0 and epoch % args.model_log_freq == 0: score_path = args.log_dir + '/classifier_scores/' + str(epoch) if not os.path.exists(score_path): os.makedirs(score_path) auxillary_loss = train_classifiers(classifiers, enc_dec, obs, goals, c_optimizer, args.batch_sz, score_path) ''' Update SMM density models ''' if args.smm: auxillary_loss = train_smm_density_models(density_vae, goal_vae, obs_z, goal_z, d_optimizer, g_optimizer) '''Train main vae''' vae_loss, g_rec, ng_rec = train_vae(args, enc_dec, obs, g_samples, enc_optimizer, args.beta) if args.logging > 0 and epoch % args.model_log_freq == 0: # save g_rec & g_samples if g_rec is not None: g_rec = g_rec.cpu().detach() g_rec = g_rec * 255.0 r_imgs = g_rec.squeeze(1).permute(0, 2, 3, 1).reshape(-1, 64, 64, 3) r_imgs = ptu.get_numpy(r_imgs).astype(np.uint8) g_true = g_samples * 255.0 t_imgs = g_true.permute(0, 2, 3, 1).reshape(-1, 64, 64, 3) 
t_imgs = ptu.get_numpy(t_imgs).astype(np.uint8) for im in range(5): img = Image.fromarray(r_imgs[im]) path = args.log_dir + '/vae_recs/' + str(epoch) if not os.path.exists(path): os.makedirs(path) img.save(path + '/g_rec' + str(im) + '.png') img = Image.fromarray(t_imgs[im]) img.save(path + '/g_true' + str(im) + '.png') ng_rec = ng_rec * 255.0 r_imgs = ng_rec.squeeze(1).permute(0, 2, 3, 1).reshape(-1, 64, 64, 3) r_imgs = ptu.get_numpy(r_imgs).astype(np.uint8) ng_true = obs[:,0,:,:,:] * 255.0 t_imgs = ng_true.permute(0, 2, 3, 1).reshape(-1, 64, 64, 3) t_imgs = ptu.get_numpy(t_imgs).astype(np.uint8) for im in range(5): img = Image.fromarray(r_imgs[im]) path = args.log_dir + '/vae_recs/' + str(epoch) if not os.path.exists(path): os.makedirs(path) img.save(path + '/ng_rec' + str(im) + '.png') img = Image.fromarray(t_imgs[im]) img.save(path + '/ng_true' + str(im) + '.png') if args.logging > 0 and epoch % args.loss_log_freq == 0: epoch_dynamics_loss[grstep] = dynamics_loss epoch_vae_loss[grstep] = vae_loss if args.dynamics_var or args.smm or grstep < args.classifiers_grad_steps: epoch_auxillary_loss[grstep] = auxillary_loss if args.logging > 0: end = time.time() print("===== EPISODE {} FINISHED IN {}s =====".format(epoch, end - start)) if args.logging > 0 and epoch % args.loss_log_freq == 0 and epoch > 0: hist.save_losses( epoch_auxillary_loss.mean(), # e.g. SMM, disagreement, classifiers max epoch_dynamics_loss.mean(), epoch_vae_loss.mean(), ) if args.logging == 2: print(hist.report_losses) if epoch % args.model_log_freq == 0: torch.save(enc_dec.state_dict(), enc_dec.savedir + '/{}model.bin'.format(epoch)) if args.dynamics_var: for model in dynamics_models: torch.save(model.state_dict(), model.savedir + '/{}model.bin'.format(epoch)) else: torch.save(dynamics_model.state_dict(), dynamics_model.savedir + '/{}model.bin'.format(epoch)) if args.use_classifiers is not None: for classifier in classifiers: torch.save(classifier.state_dict(), classifier.savedir + '/{}model.bin'.format(epoch)) if args.smm: torch.save(goal_vae.state_dict(), goal_vae.savedir + '/{}model.bin'.format(epoch)) torch.save(density_vae.state_dict(), density_vae.savedir + '/{}model.bin'.format(epoch)) if args.logging > 0: hist.save_losses_txt() if epoch == args.max_epoch: assert(False)
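# A small sketch of the prediction-horizon curriculum used in the training loop of
# main(): the dynamics model is trained on short rollouts first and the horizon grows
# with the epoch (2 -> 4 -> 8 -> 10). Wrapping the inline ifs above in a helper is only
# an illustration, not part of the original script.
def prediction_horizon(epoch):
    if epoch < 50:
        return 2
    if epoch < 150:
        return 4
    if epoch < 300:
        return 8
    return 10

# prediction_horizon(0) == 2, prediction_horizon(200) == 8, prediction_horizon(400) == 10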
class DQN_interface(LearningAgent): def __init__( self, n_actions=11, n_features=29, use_prioritized_experience_replay=True, max_trajectory_length=20, ): self.n_actions = n_actions self.n_features = n_features self.gamma = 1. self.lr = 0.001 self.epsilon = 0.5 self.epsilon_min = 0 self.epsilon_dec = 0.1 self.epsilon_dec_iter = 1000 self.replace_target_iter = 100 self.soft_update_iter = 1 self.softupdate = False self.scope_name = "DQN-model" self.epoch = 0 self.buffer_size = 5000 * max_trajectory_length self.batch_size = 512 self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.margin_constant = 2 with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') self.r = tf.placeholder(tf.float32, [ None, ], name='r') self.a = tf.placeholder(tf.int32, [ None, ], name='a') self.done = tf.placeholder(tf.float32, [ None, ], name='done') self.return_value = tf.placeholder(tf.float32, [ None, ], name='return') self.important_sampling_weight_ph = tf.placeholder( tf.float32, [None], name="important_sampling_weight") self.q_eval = self._build_q_net(self.s, self.n_actions, variable_scope="eval_net") self.q_next = self._build_q_net(self.s_, self.n_actions, variable_scope="target_net") t_params = scope_vars(absolute_scope_name("target_net")) e_params = scope_vars(absolute_scope_name("eval_net")) with tf.variable_scope('hard_replacement'): self.target_replace_op = tf.group( [tf.assign(t, e) for t, e in zip(t_params, e_params)]) with tf.variable_scope('soft_update'): self.update_target_q = self.__make_update_exp__(e_params, t_params) with tf.variable_scope('q_target'): self.td0_q_target = tf.stop_gradient( self.r + self.gamma * (1. - self.done) * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')) target_action = tf.argmax(self.q_eval, axis=-1, output_type=tf.int32) target_a_indices = tf.stack( [tf.range(tf.shape(self.a)[0], dtype=tf.int32), target_action], axis=1) target_q_sa = tf.gather_nd(params=self.q_next, indices=target_a_indices) self.double_dqn_target = tf.stop_gradient(self.r + self.gamma * (1. 
- self.done) * target_q_sa) self.montecarlo_target = self.return_value with tf.variable_scope('q_eval'): a_indices = tf.stack( [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices) with tf.variable_scope('loss'): self._build_loss() self._pick_loss() with tf.variable_scope('train'): self._train_op = tf.train.AdamOptimizer(self.lr).minimize( self.loss, var_list=e_params) def _pick_loss(self): self.loss = self.double_dqn_loss self.priority_values = self.doubel_dqn_error def _build_loss(self): if self.use_prioritized_experience_replay: self.dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference( self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss')) self.double_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.double_dqn_target, self.q_eval_wrt_a, name='Double_DQN_error')) else: self.dqn_loss = tf.reduce_mean( tf.squared_difference(self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss')) self.double_dqn_loss = tf.reduce_mean( tf.squared_difference(self.double_dqn_target, self.q_eval_wrt_a, name='Double_DQN_error')) self.montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_target, self.q_eval_wrt_a, name='MonteCarlo_error')) self.td0_error = tf.abs(self.td0_q_target - self.q_eval_wrt_a) self.doubel_dqn_error = tf.abs(self.double_dqn_target - self.q_eval_wrt_a) self.montecarlo_error = tf.abs(self.montecarlo_target - self.q_eval_wrt_a) margin_diff = tf.one_hot(self.a, self.n_actions, on_value=0., off_value=1., dtype=tf.float32) * self.margin_constant self.margin_loss = tf.reduce_mean( tf.reduce_max(self.q_eval + margin_diff, axis=1, keepdims=False) - self.q_eval_wrt_a) self.mse_margin_loss = tf.reduce_mean( tf.squared_difference( tf.reduce_max(self.q_eval + margin_diff, axis=1, keepdims=False), self.q_eval_wrt_a)) def _build_q_net(self, state, n_actions, variable_scope): with tf.variable_scope(variable_scope): fc1 = tf.layers.dense(state, units=self.n_features, activation=tf.nn.relu, name='fc1') q_out = tf.layers.dense(fc1, units=n_actions, name='q') return q_out def __make_update_exp__(self, vals, target_vals): polyak = 1.0 - 1e-2 expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append( var_target.assign(polyak * var_target + (1.0 - polyak) * var)) expression = tf.group(*expression) return expression def __make_hardreplace_exp__(self, vals, target_vals): expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(var)) expression = tf.group(*expression) return expression def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=3) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) def get_action(self, sess, obs, 
is_test=False, other_info=None): if is_test: discrete_action = self.greedy_action(sess, obs) else: discrete_action = self.choose_action(sess, obs) other_action_info = {"learning_action": discrete_action} return 3 * discrete_action, other_action_info def choose_action(self, sess, observation): observation = observation[np.newaxis, :] if np.random.uniform() < self.epsilon: action = np.random.randint(0, self.n_actions) else: actions_value = sess.run(self.q_eval, feed_dict={self.s: observation}) action = np.argmax(actions_value, axis=1)[0] return action def greedy_action(self, sess, single_observation): observation = single_observation[np.newaxis, :] actions_value = sess.run(self.q_eval, feed_dict={self.s: observation}) greedy_action = np.argmax(actions_value, axis=1)[0] return greedy_action def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def _is_exploration_enough(self, min_pool_size): if self.use_prioritized_experience_replay: return len(self.prioritized_replay_buffer) >= min_pool_size else: return len(self.replay_buffer) >= min_pool_size def update_target(self, sess): if self.softupdate: if self.epoch % self.soft_update_iter == 0: sess.run(self.update_target_q) else: if self.epoch % self.replace_target_iter == 0: sess.run(self.target_replace_op) def train(self, sess): self.update_target(sess) self.epoch += 1 if not self._is_exploration_enough(self.batch_size): return False, [0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: loss, montecarlo_loss, q_eval, returns = self.train_prioritized( sess) else: loss, montecarlo_loss, q_eval, returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) print("update epsilon:", self.epsilon) return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon def train_prioritized(self, sess): loss, q_eval, returns, montecarlo_loss = 0, 0, 0, 0 for idx in range(1): sample_indices = self.prioritized_replay_buffer.make_index( self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index( sample_indices) _, loss, q_eval, montecarlo_loss, priority_values = sess.run( [ self._train_op, self.loss, self.q_eval_wrt_a, self.montecarlo_loss, self.priority_values ], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, self.important_sampling_weight_ph: weights }) priorities = priority_values + 1e-6 self.prioritized_replay_buffer.update_priorities( sample_indices, priorities) return loss, montecarlo_loss, np.average(q_eval), np.average(returns) def train_normal(self, sess): loss, q_eval, returns, montecarlo_loss = 0, 0, 0, 0 for idx in range(1): sample_index = self.replay_buffer.make_index(self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_index) _, loss, q_eval, montecarlo_loss = sess.run( [ self._train_op, self.loss, self.q_eval_wrt_a, self.montecarlo_loss ], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, }) return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
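# A numpy sketch of the large-margin term that DQN_interface._build_loss constructs
# (margin_loss / mse_margin_loss); note that _pick_loss selects the double-DQN loss, so
# this term is built but not optimised here. A margin `m` is added to every action
# except the one stored in the batch, and the penalty is
# max_a[Q(s, a) + m * 1{a != a_taken}] - Q(s, a_taken). Shapes and the margin value are
# illustrative assumptions.
import numpy as np

def large_margin_loss(q_values, actions, margin=2.0):
    """q_values: [batch, n_actions]; actions: [batch] integer action indices."""
    batch, n_actions = q_values.shape
    margin_diff = margin * (1.0 - np.eye(n_actions)[actions])  # zero on the taken action
    q_taken = q_values[np.arange(batch), actions]
    return np.mean(np.max(q_values + margin_diff, axis=1) - q_taken)

# large_margin_loss(np.array([[0.0, 3.0, 0.5]]), np.array([1])) == 0.0
# (the taken action beats every other action by at least the margin)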
class DQNAgent: def __init__(self, gamma, action_number, minibatch, episodes, begin_train, train_step, begin_copy, copy_step, epsilon_delta, epsilon_start, epsilon_end, load_model, path_to_load, path_to_save, episode_steps, episode_to_save, max_buffer_len): # Epsilon self.epsilon_delta = epsilon_delta self.epsilon_end = epsilon_end self.epsilon_start = epsilon_start self.epsilon = epsilon_start # Main Params self.minibatch = minibatch self.action_number = action_number self.gamma = gamma # Episode Params self.begin_train = begin_train self.begin_copy = begin_copy self.copy_step = copy_step self.train_step = train_step self.episodes = episodes self.episode_steps = episode_steps self.episode_to_save = episode_to_save # I/O params self.path_to_load = path_to_load self.path_to_save = path_to_save self.load_model = load_model # Model Fields self.action = None self.state = None self.replay_buffer = ReplayBuffer(max_buffer_len) # Model self.device = torch.device( 'cuda:0' if torch.cuda.is_available() else 'cpu') # self.device = torch.device('cpu') self.model = BoxModel((150, 100, 1), action_number).to(self.device) if self.load_model: self.model.load_state_dict(torch.load(self.path_to_load)) # Rewards self.rewards_white, self.rewards_black, self.rewards = [], [], [] def reduce_epsilon(self, episode): self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ np.exp(-1. * episode / self.epsilon_delta) def epsilon_greedy(self): if (1 - self.epsilon) <= np.random.random(): self.action = np.random.randint(self.action_number) else: state = torch.autograd.Variable( torch.FloatTensor(self.state).to(self.device).unsqueeze(0)) self.action = self.model(state).max(1)[1].item() return self.action @staticmethod def preprocess_observation(observation): rgb = observation[30:180, 30:130] / 255 r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2] gray = 0.2989 * r + 0.5870 * g + 0.1140 * b return gray.reshape(1, 150, 100) def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done): return \ torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \ torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device)) def train_model(self): o_state, o_act, o_reward, o_next_state, o_done = \ self.transition_process(*self.replay_buffer.sample(self.minibatch)) q = self.model(o_state) q_next = self.model(o_next_state) y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done) loss = (q.gather(1, o_act.unsqueeze(1)).squeeze(1) - torch.autograd.Variable(y_hat.data)).pow(2).mean() self.model.optimizer.zero_grad() loss.backward() self.model.optimizer.step() def print(self, episode, reward_black, reward_white, epsilon): print(f"For episode {episode} reward white - " f"{reward_white} and black - {reward_black}," f"epsilon - {epsilon}") def train(self, env: gym.wrappers.time_limit.TimeLimit): start = time() print("Begin to Train") for episode in range(self.episodes): observation = env.reset() self.state = self.preprocess_observation(observation) reward_black, reward_white, total_reward = 0, 0, 0 for episode_steps in range(self.episode_steps): action = self.epsilon_greedy() next_observation, reward, done, _ = env.step(action) reward_black += (reward < 0) * abs(reward) reward_white += (reward > 0) * reward total_reward += reward next_state = 
self.preprocess_observation(next_observation) self.replay_buffer.push(self.state, action, reward, next_state, done) if len(self.replay_buffer) >= self.begin_train: self.train_model() # if (episode_step >= self.begin_copy) and (episode_step % self.copy_step == 0): # plt.plot(total_reward) # plt.show() # self.const_model = self.model.clone() if done: break self.reduce_epsilon(episode) if episode != 0 and episode % self.episode_to_save == 0: torch.save(self.model.state_dict(), self.path_to_save) plt.plot(self.rewards) plt.show() self.rewards_black.append(reward_black) self.rewards_white.append(reward_white) self.rewards.append(total_reward) self.print(episode, reward_black=reward_black, reward_white=reward_white, epsilon=self.epsilon) print(time() - start) def play(self, env: gym.wrappers.time_limit.TimeLimit): observation = env.reset() reward_black, reward_white, total_reward = 0, 0, 0 for episode_steps in range(self.episode_steps): state = self.preprocess_observation(observation) state = torch.autograd.Variable( torch.FloatTensor(state).to(self.device).unsqueeze(0)) print(self.model(state)) action = self.model(state).max(1)[1].item() observation, reward, done, _ = env.step(action) reward_black += (reward < 0) * abs(reward) reward_white += (reward > 0) * reward total_reward += reward sleep(0.01) env.render() if done: break print(total_reward)
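# A self-contained sketch of the exponential epsilon schedule implemented by
# DQNAgent.reduce_epsilon: eps(t) = eps_end + (eps_start - eps_end) * exp(-t / delta),
# so exploration decays smoothly toward eps_end. The constants below are illustrative
# defaults, not values taken from the agent's configuration.
import numpy as np

def epsilon_at(episode, eps_start=1.0, eps_end=0.05, delta=200.0):
    return eps_end + (eps_start - eps_end) * np.exp(-episode / delta)

# epsilon_at(0) == 1.0, and epsilon_at(2000) is already within 1e-4 of eps_end.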
class DQN2Net_interface(LearningAgent, PIDAgent): def __init__( self, user_num, n_actions, n_features, init_roi, budget, use_budget_control, use_prioritized_experience_replay, max_trajectory_length, update_times_per_train=1, ): PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=2) self.user_num = user_num self.use_budget_control = use_budget_control self.update_times_per_train = update_times_per_train self.n_actions = n_actions self.n_features = n_features self.gamma = 1. self.lr = 0.001 self.user_based_adjust_times = 40 self.epsilon = 0.4 self.epsilon_min = 0.05 self.epsilon_dec = 0.1 self.epsilon_dec_iter = 5000 // self.user_based_adjust_times self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times self.replace_target_iter = 1 self.soft_update_iter = 1 self.softupdate = True self.scope_name = "DQN-model" self.epoch = 0 self.buffer_size = 1000 * max_trajectory_length self.batch_size = 512 self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.margin_constant = 2 with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_q_net(self, state, n_actions, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense( state, units=n_features, activation=tf.nn.relu, name='fc1', kernel_initializer=initializers.xavier_initializer()) fc2 = tf.layers.dense( fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2', kernel_initializer=initializers.xavier_initializer()) fc3 = tf.layers.dense( fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3', kernel_initializer=initializers.xavier_initializer()) q_out = tf.maximum( tf.layers.dense( fc3, units=n_actions, name='q', kernel_initializer=initializers.xavier_initializer()), 0) return q_out def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') self.r_gmv = tf.placeholder(tf.float32, [ None, ], name='r_gmv') self.r_cost = tf.placeholder(tf.float32, [ None, ], name='r_cost') self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr") self.r = tf.placeholder(tf.float32, [ None, ], name='r') self.a = tf.placeholder(tf.int32, [ None, ], name='a') self.done = tf.placeholder(tf.float32, [ None, ], name='done') self.return_gmv_value = tf.placeholder(tf.float32, [ None, ], name='return_gmv') self.return_cost_value = tf.placeholder(tf.float32, [ None, ], name='return_cost') self.return_value = tf.placeholder(tf.float32, [ None, ], name='return') self.important_sampling_weight_ph = tf.placeholder( tf.float32, [None], name="important_sampling_weight") self.q_eval_gmv = 
self._build_q_net(self.s, self.n_actions, variable_scope="eval_gmv_net") self.q_next_gmv = self._build_q_net(self.s_, self.n_actions, variable_scope="target_gmv_net") self.q_eval_cost = self._build_q_net(self.s, self.n_actions, variable_scope="eval_cost_net") self.q_next_cost = self._build_q_net(self.s_, self.n_actions, variable_scope="target_cost_net") self.q_eval = self.q_eval_gmv - self.roi_thr * self.q_eval_cost self.q_next = self.q_next_gmv - self.roi_thr * self.q_next_cost t_gmv_params = scope_vars(absolute_scope_name("target_gmv_net")) e_gmv_params = scope_vars(absolute_scope_name("eval_gmv_net")) t_cost_params = scope_vars(absolute_scope_name("target_cost_net")) e_cost_params = scope_vars(absolute_scope_name("eval_cost_net")) with tf.variable_scope('hard_replacement'): self.target_gmv_replace_op = tf.group( [tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)]) self.target_cost_replace_op = tf.group([ tf.assign(t, e) for t, e in zip(t_cost_params, e_cost_params) ]) with tf.variable_scope('soft_update'): self.update_gmv_target_q = self.__make_update_exp__( e_gmv_params, t_gmv_params) self.update_cost_target_q = self.__make_update_exp__( e_cost_params, t_cost_params) with tf.variable_scope('q_target'): greedy_action_s_ = tf.argmax(self.q_next, axis=-1, name="td0_argmax_action", output_type=tf.int32) greedy_a_indices = tf.stack([ tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32), dtype=tf.int32), greedy_action_s_ ], axis=1) target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv, indices=greedy_a_indices) target_q_cost_sa = tf.gather_nd(params=self.q_next_cost, indices=greedy_a_indices) target_q_sa = tf.gather_nd(params=self.q_next, indices=greedy_a_indices) self.td0_q_gmv_target = tf.stop_gradient(self.r_gmv + self.gamma * (1. - self.done) * target_q_gmv_sa) self.td0_q_cost_target = tf.stop_gradient(self.r_cost + self.gamma * (1. - self.done) * target_q_cost_sa) self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * target_q_sa) target_action = tf.argmax(self.q_eval, axis=-1, name="doubeldqn_argmax_action", output_type=tf.int32) target_a_indices = tf.stack([ tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32), dtype=tf.int32), target_action ], axis=1) ddqn_target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv, indices=target_a_indices) ddqn_target_q_cost_sa = tf.gather_nd(params=self.q_next_cost, indices=target_a_indices) ddqn_target_q_sa = tf.gather_nd(params=self.q_next, indices=target_a_indices) self.double_dqn_gmv_target = tf.stop_gradient(self.r_gmv + self.gamma * (1. - self.done) * ddqn_target_q_gmv_sa) self.double_dqn_cost_target = tf.stop_gradient( self.r_cost + self.gamma * (1. - self.done) * ddqn_target_q_cost_sa) self.double_dqn_target = tf.stop_gradient(self.r + self.gamma * (1. 
- self.done) * ddqn_target_q_sa) self.montecarlo_gmv_target = self.return_gmv_value self.montecarlo_cost_target = self.return_cost_value self.montecarlo_target = self.return_value with tf.variable_scope('q_eval'): a_indices = tf.stack([ tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32), dtype=tf.int32), self.a ], axis=1) self.q_eval_gmv_wrt_a = tf.gather_nd(params=self.q_eval_gmv, indices=a_indices) self.q_eval_cost_wrt_a = tf.gather_nd(params=self.q_eval_cost, indices=a_indices) self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices) with tf.variable_scope('loss'): self._build_loss() self._pick_loss() with tf.variable_scope('train'): self._train_op = tf.train.AdamOptimizer(self.lr).minimize( self.loss, var_list=e_gmv_params + e_cost_params) self._train_gmv_op = tf.train.AdamOptimizer(self.lr).minimize( self.gmv_loss, var_list=e_gmv_params) self._train_cost_op = tf.train.AdamOptimizer(self.lr).minimize( self.cost_loss, var_list=e_cost_params) with tf.variable_scope('roi'): greedy_action_indices = tf.stack([ tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32), dtype=tf.int32), self.a ], axis=1) self.plongterm_roi = tf.gather_nd( params=self.q_eval_gmv, indices=greedy_action_indices) / ( tf.gather_nd(params=self.q_eval_cost, indices=greedy_action_indices) + 1e-6) def _pick_loss(self): self.has_target_net = True self.gmv_loss = self.gmv_double_dqn_loss self.cost_loss = self.cost_double_dqn_loss self.loss = self.double_dqn_loss self.priority_values = self.gmv_doubel_dqn_error + self.cost_doubel_dqn_error + self.doubel_dqn_error def _build_loss(self): if self.use_prioritized_experience_replay: self.gmv_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.td0_q_gmv_target, self.q_eval_gmv_wrt_a, name='TD0_gmv_loss')) self.cost_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.td0_q_cost_target, self.q_eval_cost_wrt_a, name='TD0_cost_loss')) self.dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference( self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss')) self.gmv_double_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.double_dqn_gmv_target, self.q_eval_gmv_wrt_a, name='Double_DQN_gmv_loss')) self.cost_double_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.double_dqn_cost_target, self.q_eval_cost_wrt_a, name='Double_DQN_cost_loss')) self.double_dqn_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.double_dqn_target, self.q_eval_wrt_a, name='Double_DQN_error')) self.gmv_montecarlo_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.montecarlo_gmv_target, self.q_eval_gmv_wrt_a, name='GMV_error')) self.cost_montecarlo_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.montecarlo_cost_target, self.q_eval_cost_wrt_a, name='COST_error')) self.montecarlo_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.montecarlo_target, self.q_eval_wrt_a, name='MonteCarlo_error')) else: self.gmv_dqn_loss = tf.reduce_mean( tf.squared_difference(self.td0_q_gmv_target, self.q_eval_gmv_wrt_a, name='TD0_gmv_loss')) self.cost_dqn_loss = tf.reduce_mean( tf.squared_difference(self.td0_q_cost_target, self.q_eval_cost_wrt_a, name='TD0_cost_loss')) self.dqn_loss = tf.reduce_mean( tf.squared_difference(self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss')) self.gmv_double_dqn_loss 
= tf.reduce_mean( tf.squared_difference(self.double_dqn_gmv_target, self.q_eval_gmv_wrt_a, name='Double_DQN_gmv_loss')) self.cost_double_dqn_loss = tf.reduce_mean( tf.squared_difference(self.double_dqn_cost_target, self.q_eval_cost_wrt_a, name='Double_DQN_cost_loss')) self.double_dqn_loss = tf.reduce_mean( tf.squared_difference(self.double_dqn_target, self.q_eval_wrt_a, name='Double_DQN_error')) self.gmv_montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_gmv_target, self.q_eval_gmv_wrt_a, name='MonteCarlo_gmv_loss')) self.cost_montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_cost_target, self.q_eval_cost_wrt_a, name='MonteCarlo_cost_loss')) self.montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_target, self.q_eval_wrt_a, name='MonteCarlo_error')) self.gmv_td0_error = tf.abs(self.td0_q_gmv_target - self.q_eval_gmv_wrt_a) self.cost_td0_error = tf.abs(self.td0_q_cost_target - self.q_eval_cost_wrt_a) self.td0_error = tf.abs(self.td0_q_target - self.q_eval_wrt_a) self.gmv_doubel_dqn_error = tf.abs(self.double_dqn_gmv_target - self.q_eval_gmv_wrt_a) self.cost_doubel_dqn_error = tf.abs(self.double_dqn_cost_target - self.q_eval_cost_wrt_a) self.doubel_dqn_error = tf.abs(self.double_dqn_target - self.q_eval_wrt_a) self.gmv_montecarlo_error = tf.abs(self.montecarlo_gmv_target - self.q_eval_gmv_wrt_a) self.cost_montecarlo_error = tf.abs(self.montecarlo_cost_target - self.q_eval_cost_wrt_a) self.montecarlo_error = tf.abs(self.montecarlo_target - self.q_eval_wrt_a) def __make_update_exp__(self, vals, target_vals): polyak = 1.0 - 1e-2 expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append( var_target.assign(polyak * var_target + (1.0 - polyak) * var)) expression = tf.group(*expression) return expression def __make_hardreplace_exp__(self, vals, target_vals): expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(var)) expression = tf.group(*expression) return expression def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): new_trajectory_gmv = other_info["gmv"] new_trajectory_cost = other_info["cost"] if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) add_episode(self.gmv_replay_buffer, new_trajectory_gmv, gamma=self.gamma) add_episode(self.cost_replay_buffer, new_trajectory_cost, gamma=self.gamma) def get_action(self, sess, obs, is_test=False, other_info=None): if is_test: discrete_action = self.greedy_action(sess, obs, other_info) else: discrete_action = self.choose_action(sess, obs, other_info) bid_max = MultiUserEnv.bid_max bid_min = MultiUserEnv.bid_min other_action_info = {"learning_action": discrete_action} return bid_min + (bid_max - bid_min) / ( self.n_actions - 1) * discrete_action, other_action_info def 
__greedy__(self, sess, observation, roi_thr): observations = observation[np.newaxis, :] actions_value = sess.run(self.q_eval, feed_dict={ self.s: observations, self.roi_thr: roi_thr }) greedy_action = np.argmax(actions_value, axis=1)[0] return greedy_action def __epsilon_greedy__(self, sess, observation, roi_thr): if np.random.uniform() < self.epsilon: action = np.random.randint(0, self.n_actions) else: action = self.__greedy__(sess, observation, roi_thr) return action def choose_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi return self.__epsilon_greedy__(sess, observation, roi_thr) def greedy_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi greedy_action = self.__greedy__(sess, observation, roi_thr) if self.use_budget_control: user_idx = other_info["user_idx"] request_idx = other_info["request_idx"] roi_threshold = self.get_roi_threshold() if request_idx == 0: observations = np.expand_dims(observation, axis=0) max_plongterm_roi = sess.run(self.plongterm_roi, feed_dict={ self.s: observations, self.a: [greedy_action], }) if max_plongterm_roi >= roi_threshold: self.explore_user(user_idx) return greedy_action else: return 0 else: if self.is_user_selected(user_idx): return greedy_action else: return 0 else: return greedy_action def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def _is_exploration_enough(self, min_pool_size): if self.use_prioritized_experience_replay: return len(self.prioritized_replay_buffer) >= min_pool_size else: return len(self.replay_buffer) >= min_pool_size def update_target(self, sess): if self.softupdate: if self.epoch % self.soft_update_iter == 0: sess.run(self.update_gmv_target_q) sess.run(self.update_cost_target_q) else: if self.epoch % self.replace_target_iter == 0: sess.run(self.target_gmv_replace_op) sess.run(self.target_cost_replace_op) def train(self, sess): if self.has_target_net: self.update_target(sess) self.epoch += 1 if not self._is_exploration_enough(self.batch_size): return False, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_prioritized(sess) else: loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) self.epsilon_dec_iter //= 1.5 self.epsilon_dec_iter = max(self.epsilon_dec_iter, self.epsilon_dec_iter_min) print("update epsilon:", self.epsilon) return True, [ 0, 0, loss, montecarlo_loss, q_eval, returns, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns ], self.get_memory_returns(), self.epsilon def train_prioritized(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0 cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0 if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi for idx in 
range(self.update_times_per_train): sample_indices = self.prioritized_replay_buffer.make_index( self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index( sample_indices) obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index( sample_indices) obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval, \ _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \ _2, cost_loss, cost_montecarlo_loss, cost_q_eval, \ priority_values = sess.run( [self._train_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a, self._train_gmv_op, self.gmv_loss, self.gmv_montecarlo_loss, self.q_eval_gmv_wrt_a, self._train_cost_op, self.cost_loss, self.cost_montecarlo_loss, self.q_eval_cost_wrt_a, self.priority_values], feed_dict={ self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.s_: obs_next, self.done: done, self.return_gmv_value: gmv_returns, self.return_cost_value: cost_returns, self.return_value: returns, self.important_sampling_weight_ph: weights, self.roi_thr: roi_thr }) priorities = priority_values + 1e-6 self.prioritized_replay_buffer.update_priorities( sample_indices, priorities) return loss, montecarlo_loss, np.average(q_eval), np.average(returns), \ gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \ cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns) def train_normal(self, sess): loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0 cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0 if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi for idx in range(self.update_times_per_train): sample_indices = self.replay_buffer.make_index(self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index( sample_indices) obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval, \ _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \ _2, cost_loss, cost_montecarlo_loss, cost_q_eval \ = sess.run( [self._train_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a, self._train_gmv_op, self.gmv_loss, self.gmv_montecarlo_loss, self.q_eval_gmv_wrt_a, self._train_cost_op, self.cost_loss, self.cost_montecarlo_loss, self.q_eval_cost_wrt_a], feed_dict={ self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.s_: obs_next, self.done: done, self.return_gmv_value: gmv_returns, self.return_cost_value: cost_returns, self.return_value: returns, self.roi_thr: roi_thr }) return loss, montecarlo_loss, np.average(q_eval), np.average(returns), \ gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \ cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)
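# --- Illustrative sketch, not part of the original agent classes ---
# The graph built above combines a GMV critic and a cost critic into a single
# action value Q(s, a) = Q_gmv(s, a) - roi_thr * Q_cost(s, a) and bootstraps it
# with a Double-DQN style target (next action picked by the online network,
# evaluated by the target network). The NumPy function below restates that
# target computation for a batch; every argument name is local to this sketch,
# and the exact way the combined reward r is formed is an assumption, not the
# repository's API.
import numpy as np

def decomposed_double_dqn_targets(r_gmv, r_cost, r, done, gamma, roi_thr,
                                  q_eval_gmv_next, q_eval_cost_next,
                                  q_target_gmv_next, q_target_cost_next):
    """q_* arrays have shape [batch, n_actions]; rewards and done are [batch]."""
    # Greedy next action under the online (eval) networks, combined by roi_thr.
    q_eval_next = q_eval_gmv_next - roi_thr * q_eval_cost_next
    a_star = np.argmax(q_eval_next, axis=1)
    rows = np.arange(a_star.shape[0])
    # ...but evaluated with the target networks (the Double-DQN correction).
    gmv_next = q_target_gmv_next[rows, a_star]
    cost_next = q_target_cost_next[rows, a_star]
    mask = gamma * (1.0 - done)
    td_gmv_target = r_gmv + mask * gmv_next
    td_cost_target = r_cost + mask * cost_next
    td_target = r + mask * (gmv_next - roi_thr * cost_next)
    return td_gmv_target, td_cost_target, td_target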
class DDPG_interface(LearningAgent, PIDAgent): def __init__( self, user_num, action_dim, action_bound, n_features, init_roi, budget, use_budget_control, use_prioritized_experience_replay, max_trajectory_length, update_times_per_train, ): PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=2) self.use_budget_control = use_budget_control self.user_num = user_num self.action_bound = action_bound self.action_dim = action_dim self.n_actions = 1 self.n_features = n_features self.gamma = 1. self.update_times_per_train = update_times_per_train self.lr = 0.001 self.epsilon = 0.9 self.epsilon_min = 0.1 self.epsilon_dec = 0.3 self.epsilon_dec_iter = 100 self.replace_target_iter = 300 self.soft_update_iter = 1 self.softupdate = True self.scope_name = "DDPG-model" self.epoch = 0 self.exploration_noise = OUNoise(self.action_dim) self.noise_weight = 1 self.noise_descrement_per_sampling = 0.0001 self.buffer_size = 20000 * max_trajectory_length self.batch_size = 512 self.alpha = 0.6 self.beta = 0.4 self.use_prioritized_experience_replay = use_prioritized_experience_replay if self.use_prioritized_experience_replay: self.prioritized_replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.alpha, max_priority=20.) else: self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True) with tf.variable_scope(self.scope_name): self._build_net() self.build_model_saver(self.scope_name) def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') self.r_gmv = tf.placeholder(tf.float32, [ None, ], name='r_gmv') self.r_cost = tf.placeholder(tf.float32, [ None, ], name='r_cost') self.r = tf.placeholder(tf.float32, [ None, ], name='r') self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr") self.a = tf.placeholder(tf.float32, [ None, ], name='a') self.done = tf.placeholder(tf.float32, [ None, ], name='done') self.gmv_return_value = tf.placeholder(tf.float32, [ None, ], name='gmv_return') self.cost_return_value = tf.placeholder(tf.float32, [ None, ], name='cost_return') self.return_value = tf.placeholder(tf.float32, [ None, ], name='return') self.important_sampling_weight_ph = tf.placeholder( tf.float32, [None], name="important_sampling_weight") self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net") self.a_target = self._build_action_net( self.s_, variable_scope="actor_target_net") self.gmv_critic_eval = self._build_q_net( self.s, self.a, variable_scope="gmv_critic_eval_net") self.gmv_critic_eval_for_loss = self._build_q_net( self.s, self.a_eval, variable_scope="gmv_critic_eval_net", reuse=True) self.gmv_critic_target = self._build_q_net( self.s_, self.a_target, variable_scope="gmv_critic_target_net") self.cost_critic_eval = self._build_q_net( self.s, self.a, variable_scope="cost_critic_eval_net") self.cost_critic_eval_for_loss = self._build_q_net( self.s, self.a_eval, variable_scope="cost_critic_eval_net", reuse=True) self.cost_critic_target = self._build_q_net( self.s_, self.a_target, variable_scope="cost_critic_target_net") self.critic_eval = self.gmv_critic_eval - self.roi_thr * self.cost_critic_eval self.critic_eval_for_loss = self.gmv_critic_eval_for_loss - self.roi_thr * self.cost_critic_eval_for_loss self.critic_target = self.gmv_critic_target - self.roi_thr * 
self.cost_critic_target ae_params = scope_vars(absolute_scope_name("actor_eval_net")) at_params = scope_vars(absolute_scope_name("actor_target_net")) gmv_ce_params = scope_vars(absolute_scope_name("gmv_critic_eval_net")) gmv_ct_params = scope_vars( absolute_scope_name("gmv_critic_target_net")) cost_ce_params = scope_vars( absolute_scope_name("cost_critic_eval_net")) cost_ct_params = scope_vars( absolute_scope_name("cost_critic_target_net")) print(ae_params) print(at_params) print(gmv_ce_params) print(gmv_ct_params) print(cost_ce_params) print(cost_ct_params) with tf.variable_scope('hard_replacement'): self.a_target_replace_op = tf.group( [tf.assign(t, e) for t, e in zip(at_params, ae_params)]) self.gmv_c_target_replace_op = tf.group([ tf.assign(t, e) for t, e in zip(gmv_ct_params, gmv_ce_params) ]) self.cost_c_target_replace_op = tf.group([ tf.assign(t, e) for t, e in zip(cost_ct_params, cost_ce_params) ]) with tf.variable_scope('soft_update'): self.a_update_target_q = self.__make_update_exp__( ae_params, at_params) self.gmv_c_update_target_q = self.__make_update_exp__( gmv_ce_params, gmv_ct_params) self.cost_c_update_target_q = self.__make_update_exp__( cost_ce_params, cost_ct_params) with tf.variable_scope('q_target'): self.td0_gmv_q_target = tf.stop_gradient(self.r_gmv + self.gamma * (1. - self.done) * self.gmv_critic_target) self.td0_cost_q_target = tf.stop_gradient(self.r_cost + self.gamma * (1. - self.done) * self.cost_critic_target) self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * self.critic_target) self.montecarlo_gmv_target = self.gmv_return_value self.montecarlo_cost_target = self.cost_return_value self.montecarlo_target = self.return_value with tf.variable_scope('loss'): self._build_loss() self._pick_loss() with tf.variable_scope('train'): self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize( self.loss, var_list=gmv_ce_params + cost_ce_params) self._train_gmv_c_op = tf.train.AdamOptimizer(self.lr).minimize( self.gmv_loss, var_list=gmv_ce_params) self._train_cost_c_op = tf.train.AdamOptimizer(self.lr).minimize( self.cost_loss, var_list=cost_ce_params) self._train_a_op = tf.train.AdamOptimizer(self.lr).minimize( self.actor_loss, var_list=ae_params) with tf.variable_scope('roi'): self.max_longterm_roi = self.gmv_critic_eval / ( self.cost_critic_eval + 1e-4) def _pick_loss(self): self.has_target_net = True self.loss = self.td_loss self.gmv_loss = self.gmv_td_loss self.cost_loss = self.cost_td_loss self.actor_loss = self.a_loss self.priority_values = self.montecarlo_gmv_error + self.montecarlo_cost_error def _build_loss(self): if self.use_prioritized_experience_replay: self.gmv_td_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.td0_gmv_q_target, self.gmv_critic_eval, name='TD0_gmv_loss')) self.cost_td_loss = tf.reduce_mean( self.important_sampling_weight_ph * tf.squared_difference(self.td0_cost_q_target, self.cost_critic_eval, name='TD0_cost_loss')) else: self.gmv_td_loss = tf.reduce_mean( tf.squared_difference(self.td0_gmv_q_target, self.gmv_critic_eval, name='TD0_gmv_loss')) self.cost_td_loss = tf.reduce_mean( tf.squared_difference(self.td0_cost_q_target, self.cost_critic_eval, name='TD0_cost_loss')) self.td_loss = tf.reduce_mean( tf.squared_difference(self.td0_q_target, self.critic_eval, name='TD0_loss')) self.a_loss = -tf.reduce_mean(self.critic_eval_for_loss) self.gmv_montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_gmv_target, self.gmv_critic_eval, name='MonteCarlo_gmv_error')) 
self.cost_montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_cost_target, self.cost_critic_eval, name='MonteCarlo_cost_error')) self.montecarlo_loss = tf.reduce_mean( tf.squared_difference(self.montecarlo_target, self.critic_eval, name='MonteCarlo_error')) self.td0_gmv_error = tf.abs(self.td0_gmv_q_target - self.gmv_critic_eval) self.td0_cost_error = tf.abs(self.td0_cost_q_target - self.cost_critic_eval) self.td0_error = tf.abs(self.td0_q_target - self.critic_eval) self.montecarlo_gmv_error = tf.abs(self.montecarlo_gmv_target - self.gmv_critic_eval) self.montecarlo_cost_error = tf.abs(self.montecarlo_cost_target - self.cost_critic_eval) self.montecarlo_error = tf.abs(self.montecarlo_target - self.critic_eval) def _build_q_net(self, state, action, variable_scope, reuse=False): with tf.variable_scope(variable_scope, reuse=reuse): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] state = tf.concat( [state, tf.expand_dims(action, axis=1, name="2d-action")], axis=1) fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1') fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2') q = tf.layers.dense(fc2, units=self.action_dim, name='q') return q[:, 0] def _build_action_net(self, state, variable_scope): with tf.variable_scope(variable_scope): user_id_embedding_table = tf.get_variable( name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(), trainable=True, dtype=tf.float32) user_id = tf.cast(state[:, 0], dtype=tf.int32) user_id_embeddings = tf.nn.embedding_lookup( user_id_embedding_table, ids=user_id, name="user_id_embedding") state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1) n_features = state.get_shape()[1] fc1 = tf.layers.dense(state, units=n_features // 2, activation=tf.nn.relu, name='fc1') actions = tf.layers.dense(fc1, self.action_dim, activation=tf.nn.sigmoid, name='a') scaled_a = tf.multiply(actions, 1, name='scaled_a') return scaled_a[:, 0] def __make_update_exp__(self, vals, target_vals): polyak = 1.0 - 1e-2 expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append( var_target.assign(polyak * var_target + (1.0 - polyak) * var)) expression = tf.group(*expression) return expression def __make_hardreplace_exp__(self, vals, target_vals): expression = [] for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): expression.append(var_target.assign(var)) expression = tf.group(*expression) return expression def build_model_saver(self, var_scope): var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope) self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=3) def save(self, sess, path, step): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) self.model_saver.save(sess, save_path=path, global_step=step) def restore(self, sess, path): self.model_saver.restore(sess, save_path=path) print('%s model reloaded from %s' % (self.scope_name, path)) def experience(self, new_trajectory, other_info=None): new_trajectory_gmv = 
other_info["gmv"] new_trajectory_cost = other_info["cost"] if self.use_prioritized_experience_replay: add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma) else: add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma) add_episode(self.gmv_replay_buffer, new_trajectory_gmv, gamma=self.gamma) add_episode(self.cost_replay_buffer, new_trajectory_cost, gamma=self.gamma) def __epsilon_greedy__(self, sess, observation, roi_thr): if np.random.uniform() < self.epsilon: observation = observation[np.newaxis, :] actions_value = sess.run(self.a_eval, feed_dict={ self.s: observation, self.roi_thr: roi_thr }) action_noise = self.exploration_noise.noise() bid = actions_value + action_noise bid = bid[0] else: bid = self.__greedy__(sess, observation, roi_thr) return bid def __greedy__(self, sess, observation, roi_thr): observation = observation[np.newaxis, :] bid = sess.run(self.a_eval, feed_dict={ self.s: observation, self.roi_thr: roi_thr }) return bid[0] def choose_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi return self.__epsilon_greedy__(sess, observation, roi_thr) def greedy_action(self, sess, observation, other_info): if self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi bid = self.__greedy__(sess, observation, roi_thr) if self.use_budget_control: user_idx = other_info["user_idx"] request_idx = other_info["request_idx"] roi_threshold = self.get_roi_threshold() if request_idx == 0: observations = observation[np.newaxis, :] max_plongterm_roi = sess.run(self.max_longterm_roi, feed_dict={ self.s: observations, self.a: [bid] }) if max_plongterm_roi >= roi_threshold: self.explore_user(user_idx) return bid else: return 0. 
else: if self.is_user_selected(user_idx): return bid else: return 0 else: return bid def get_action(self, sess, obs, is_test=False, other_info=None): if is_test: discrete_action = self.greedy_action(sess, obs, other_info) else: discrete_action = self.choose_action(sess, obs, other_info) other_action_info = {"learning_action": discrete_action} return self.action_bound * np.clip(discrete_action, 0, 1), other_action_info def get_memory_returns(self): if self.use_prioritized_experience_replay: return self.prioritized_replay_buffer.current_mean_return else: return self.replay_buffer.current_mean_return def _is_exploration_enough(self, min_pool_size): if self.use_prioritized_experience_replay: return len(self.prioritized_replay_buffer) >= min_pool_size else: return len(self.replay_buffer) >= min_pool_size def update_target(self, sess): if self.softupdate: if self.epoch % self.soft_update_iter == 0: sess.run(self.gmv_c_update_target_q) sess.run(self.cost_c_update_target_q) sess.run(self.a_update_target_q) else: if self.epoch % self.replace_target_iter == 0: sess.run(self.gmv_c_update_target_q) sess.run(self.cost_c_update_target_q) sess.run(self.a_target_replace_op) def train(self, sess): if self.has_target_net: self.update_target(sess) self.epoch += 1 if not self._is_exploration_enough(self.batch_size): return False, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0 if self.use_prioritized_experience_replay: policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_prioritized(sess) else: policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \ gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \ cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_normal(sess) if self.epoch % self.epsilon_dec_iter == 0: self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min) print("update epsilon:", self.epsilon) return True, [ policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns ], self.get_memory_returns(), self.epsilon def train_prioritized(self, sess): loss, q_eval, returns, montecarlo_loss = 0, 0, 0, 0 for idx in range(self.update_times_per_train): sample_indices = self.prioritized_replay_buffer.make_index( self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index( sample_indices) _, loss, q_eval, montecarlo_loss, priority_values = sess.run( [ self._train_c_op, self.loss, self.critic_eval, self.montecarlo_loss, self.priority_values ], feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, self.important_sampling_weight_ph: weights }) sess.run(self._train_a_op, feed_dict={ self.s: obs, self.a: act, self.r: rew, self.s_: obs_next, self.done: done, self.return_value: returns, self.important_sampling_weight_ph: weights }) priorities = priority_values + 1e-6 self.prioritized_replay_buffer.update_priorities( sample_indices, priorities) return loss, montecarlo_loss, np.average(q_eval), np.average(returns) def train_normal(self, sess): policy_loss, policy_entropy = 0, 0 loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0 gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0 cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0 if 
self.use_budget_control: roi_thr = self.get_roi_threshold() else: roi_thr = self.init_roi for idx in range(self.update_times_per_train): sample_indices = self.replay_buffer.make_index(self.batch_size) obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index( sample_indices) obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index( sample_indices) obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index( sample_indices) _, loss, montecarlo_loss, q_eval, \ _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \ _2, cost_loss, cost_montecarlo_loss, cost_q_eval \ = sess.run( [self._train_op, self.loss, self.montecarlo_loss, self.critic_eval, self._train_gmv_c_op, self.gmv_loss, self.gmv_montecarlo_loss, self.gmv_critic_eval, self._train_cost_c_op, self.cost_loss, self.cost_montecarlo_loss, self.cost_critic_eval], feed_dict={ self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.r: rew, self.s_: obs_next, self.done: done, self.gmv_return_value: gmv_returns, self.cost_return_value: cost_returns, self.return_value: returns, self.roi_thr: roi_thr }) _, actor_loss = sess.run( [self._train_a_op, self.actor_loss], feed_dict={ self.roi_thr: roi_thr, self.s: obs, self.a: act, self.r_gmv: rew_gmv, self.r_cost: rew_cost, self.s_: obs_next, self.done: done, self.gmv_return_value: gmv_returns, self.cost_return_value: cost_returns, }) return 0, 0, loss, montecarlo_loss, np.average(q_eval), np.average(returns), \ gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \ cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)
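# --- Illustrative sketch, not part of the original agent classes ---
# DDPG_interface above draws exploration noise from an OUNoise helper defined
# elsewhere in the repository. The class below is a common Ornstein-Uhlenbeck
# process of the kind usually paired with DDPG (temporally correlated noise
# that drifts back toward a mean); the mu/theta/sigma defaults are assumptions
# of this sketch, not the repository's values.
import numpy as np

class OUNoiseSketch:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Restart the process at the mean, e.g. at the start of an episode.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) dt + sigma * dW: mean-reverting drift plus
        # Gaussian diffusion, discretised with dt = 1.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

# Usage in the same spirit as __epsilon_greedy__ above:
#   noise = OUNoiseSketch(action_dim=1)
#   noisy_bid = deterministic_bid + noise.noise()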
class DDQNAgentCnn(GeneralAgent): def __init__(self, gamma, action_number, minibatch, episodes, begin_train, copy_step, epsilon_delta, epsilon_start, epsilon_end, load_model, path_to_load, path_to_save, plots_to_save, episode_steps, episode_to_save, max_buffer_len, model_type ): super().__init__(gamma=gamma, action_number=action_number, path_to_load=path_to_load, path_to_save=path_to_save, plots_to_save=plots_to_save, load_model=load_model, episode_to_save=episode_to_save, episodes=episodes, model_type=model_type) # Epsilon self.epsilon_delta = epsilon_delta self.epsilon_end = epsilon_end self.epsilon_start = epsilon_start self.epsilon = epsilon_start # Main Params self.minibatch = minibatch # Episode Params self.begin_train = begin_train self.copy_step = copy_step self.episode_steps = episode_steps # Model Fields self.action = None self.state = None self.replay_buffer = ReplayBuffer(max_buffer_len) # Model self.target_model = model_type(action_number).to(self.device) self.update_target() # Rewards self.rewards_white, self.rewards_black, self.rewards = [], [], [] self.losses = [] self.periodic_reward = 0 self.periodic_rewards = [] def update_target(self): self.target_model.load_state_dict(self.model.state_dict()) def reduce_epsilon(self, episode): self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ np.exp(-1. * episode / self.epsilon_delta) def epsilon_greedy(self): if (1 - self.epsilon) <= np.random.random(): self.action = np.random.randint(self.action_number) else: state = torch.autograd.Variable(torch.FloatTensor(self.state).to(self.device).unsqueeze(0)) self.action = self.model(state).max(1)[1].item() return self.action @staticmethod def preprocess_observation(obs): img = resize(rgb2gray(obs[0:188, 23:136, :]), (28, 28), mode='constant') img = img.reshape(1, 28, 28) return img def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done): return \ torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \ torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \ torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device)) def train_model(self): o_state, o_act, o_reward, o_next_state, o_done = \ self.transition_process(*self.replay_buffer.sample(self.minibatch)) q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1) q_next = self.target_model(o_next_state) y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done) loss = (q - y_hat.detach()).pow(2).mean() self.model.optimizer.zero_grad() loss.backward() self.model.optimizer.step() return loss def init_new_episode(self, env): observation = env.reset() self.state = self.preprocess_observation(observation) def episode_check(self, episode, loss): if episode % self.copy_step == 0: self.losses.append(loss) self.update_target() if episode % self.episode_steps == 0: self.periodic_rewards.append(self.periodic_reward / self.episode_steps) self.periodic_reward = 0 if episode % self.episode_to_save == 0: torch.save(self.model.state_dict(), self.path_to_save) fig = plt.figure() plt.plot(self.rewards) fig.savefig(self.plots_to_save + '_reward.png') plt.close(fig) fig = plt.figure() plt.plot(self.losses) fig.savefig(self.plots_to_save + '_loss.png') plt.close(fig) fig = plt.figure() plt.plot(self.periodic_rewards) fig.savefig(self.plots_to_save + '_periodic_reward.png') plt.close(fig) def train(self, 
env: gym.wrappers.time_limit.TimeLimit): self.init_new_episode(env) total_reward = 0 episode_reward = 0 loss = 0 for episode in self.trangle: self.trangle.set_description( f"Episode: {episode} | Episode Reward {episode_reward} | Periodic reward " f"{self.periodic_reward / self.episode_steps} | Average Reward {total_reward / (episode + 1)}" ) self.trangle.refresh() action = self.epsilon_greedy() next_observation, reward, done, _ = env.step(action) total_reward += reward episode_reward += reward self.periodic_reward += reward next_state = self.preprocess_observation(next_observation) self.replay_buffer.push(self.state, action, reward, next_state, done) self.state = next_state if len(self.replay_buffer) >= self.begin_train: loss = self.train_model() self.reduce_epsilon(episode) self.episode_check(episode, loss) if done: self.init_new_episode(env) self.rewards.append(episode_reward) episode_reward = 0
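# --- Illustrative sketch, not part of the original agent classes ---
# DDQNAgentCnn.train_model above bootstraps from the plain maximum of the
# target network, i.e. a vanilla DQN target. The class name suggests Double
# DQN, whose target instead selects the next action with the online model and
# evaluates it with the target model. A minimal PyTorch sketch of that
# variant, assuming the same (state, action, reward, next_state, done)
# tensors produced by transition_process:
import torch

def double_dqn_loss(model, target_model, o_state, o_act, o_reward,
                    o_next_state, o_done, gamma):
    # Q(s, a) under the online network for the actions actually taken.
    q = model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # Action selection with the online network ...
        next_actions = model(o_next_state).max(1)[1]
        # ... value estimation with the target network.
        q_next = target_model(o_next_state).gather(1, next_actions.unsqueeze(1)).squeeze(1)
        y = o_reward + gamma * q_next * (1 - o_done)
    return (q - y).pow(2).mean()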