def __init__(self,
             user_num,
             n_features,
             init_roi,
             budget,
             use_budget_control,
             max_trajectory_length,
             update_times_per_train=1):
        """Set up the myopic greedy agent.

        Combines PID-based budget control (via PIDAgent) with a plain
        uniform replay buffer and a TF graph built under its own scope.
        """
        # PID budget controller handles ROI/budget pacing.
        PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1,
                          budget=budget, integration=1)

        # Problem dimensions and training knobs.
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = 1
        self.n_features = n_features
        self.lr = 0.001

        self.scope_name = "MyopicGreedy-model"
        self.epoch = 0

        # Replay holds roughly 1000 trajectories' worth of transitions.
        self.buffer_size = 1000 * max_trajectory_length
        self.batch_size = 512
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=False)

        # Build the TF graph and its saver inside a dedicated variable scope.
        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)
Example #2
Votes: 0
File: Agent.py — Project: karunraju/NFF
    def __init__(self, render=False, method='Duel'):
        """Create the environment, replay memory, Q-network and log files.

        render -- use the rendering variant of the NEL environment.
        method -- 'Duel' or 'DoubleQ'; anything else raises NotImplementedError.
        """
        # Environment (render variant only when asked for).
        self.render = render
        self.env = gym.make('NEL-render-v0' if render else 'NEL-v0')
        self.an = self.env.action_space.n  # number of discrete actions

        # Core training hyper-parameters.
        self.epsilon = 0.5
        self.training_time = PARAM.TRAINING_TIME  # total training steps
        self.df = PARAM.DISCOUNT_FACTOR
        self.batch_size = PARAM.BATCH_SIZE
        self.method = method
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0

        # Prioritized-replay settings.
        self.prioritized_replay = PARAM.PRIORITIZED_REPLAY
        self.prioritized_replay_eps = 1e-6
        self.prioritized_replay_alpha = 0.8
        self.prioritized_replay_beta0 = 0.4
        self.burn_in = PARAM.BURN_IN

        # Replay memory: prioritized (with beta annealed to 1 over the whole
        # run) or plain uniform replay.
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha)
            self.beta_schedule = LinearSchedule(
                float(self.training_time),
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
            self.beta_schedule = None

        # Q-network variant selection.
        if self.method == 'Duel':
            print('Using Duel Network.')
            self.net = DuelQNetwork(self.an)
        elif self.method == 'DoubleQ':
            print('Using DoubleQ Network.')
            self.net = DoubleQNetwork(self.an)
        else:
            raise NotImplementedError

        # Per-run output directory + reward log files.
        stamp = time.strftime("%Y%m%d-%H%M%S")
        self.dump_dir = os.getcwd() + '/tmp_' + self.method + '_' + stamp + '/'
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')
Example #3
Votes: 0
    def __init__(self, user_num, action_dim, action_bound, cvr_n_features, ddpg_n_features, init_roi, budget,
                 use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Configure the constrained-DDPG agent: hyper-parameters, replay
        buffers (CVR + DDPG) and the TF graph/saver.

        NOTE(review): init_roi and budget are accepted but never stored
        here — presumably consumed elsewhere; confirm against the base class.
        """
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train

        # Continuous action of dimension `action_dim`, bounded by
        # `action_bound`; a single action is emitted per decision.
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.n_actions = 1

        self.cvr_n_features = cvr_n_features
        self.ddpg_n_features = ddpg_n_features
        self.lr = 0.001
        self.use_predict_cvr = use_predict_cvr

        # Epsilon-greedy exploration schedule.
        self.user_based_adjust_times = 40
        self.epsilon = 0.9
        self.epsilon_min = 0.05
        self.epsilon_dec = 0.3
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # Target-network update cadence (soft/Polyak updates enabled).
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = True

        self.scope_name = "CDDPG-model"
        self.epoch = 0

        # Ornstein-Uhlenbeck noise for continuous-action exploration.
        self.exploration_noise = OUNoise(self.action_dim)

        # Buffer for the supervised CVR estimator.
        self.cvr_buffer_size = 1000 * max_trajectory_length
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        # DDPG replay: prioritized or uniform.
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.ddpg_buffer_size = 1000 * max_trajectory_length
        self.ddpg_batch_size = 256
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.ddpg_buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.ddpg_buffer_size, save_return=True)

        # Build the TF graph and saver inside the agent's variable scope.
        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)
Example #4
Votes: 0
    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 train_step, begin_copy, copy_step, epsilon_delta,
                 epsilon_start, epsilon_end, load_model, path_to_load,
                 path_to_save, episode_steps, episode_to_save, max_buffer_len):
        """Set up the DQN-style agent: exploration schedule, training
        cadence, replay buffer, and the torch model (optionally restored
        from a checkpoint)."""
        # Exploration (epsilon) schedule.
        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Core learning hyper-parameters.
        self.minibatch = minibatch
        self.action_number = action_number
        self.gamma = gamma

        # Episode / update-cadence bookkeeping.
        self.begin_train = begin_train
        self.begin_copy = begin_copy
        self.copy_step = copy_step
        self.train_step = train_step
        self.episodes = episodes
        self.episode_steps = episode_steps
        self.episode_to_save = episode_to_save

        # Checkpoint I/O.
        self.path_to_load = path_to_load
        self.path_to_save = path_to_save
        self.load_model = load_model

        # Runtime state + replay memory.
        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model lives on GPU when available, CPU otherwise.
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if use_cuda else 'cpu')
        self.model = BoxModel((150, 100, 1), action_number).to(self.device)
        if self.load_model:
            self.model.load_state_dict(torch.load(self.path_to_load))

        # Per-colour and combined reward traces.
        self.rewards_white = []
        self.rewards_black = []
        self.rewards = []
Example #5
Votes: 0
    def __init__(self, render=False):
        """Set up the environment, replay memory, PPO network and logging.

        render -- use the rendering variant of the NEL environment.

        Bug fix: the episode buffer was built as ``[[]] * tmax``, which
        makes every slot alias the SAME list object, so an append through
        one slot shows up in all of them.  Build independent lists instead
        (identical behavior when slots are only assigned, correct when
        they are appended to).
        """
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        self.an = self.env.action_space.n  # No. of actions in env
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.method = 'PPO'
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0

        self.burn_in = PARAM.BURN_IN
        self.tmax = PARAM.A2C_EPISODE_SIZE_MAX
        self.tmin = PARAM.A2C_EPISODE_SIZE_MIN
        self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
        self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
        # One independent list per slot ([[]] * n would alias one list).
        self.episode_buffer = [[] for _ in range(self.tmax)]
        self.net = PPO(self.episode_buffer, self.replay_buffer)

        # Per-run output directory + reward log files.
        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

        # Prime the replay memory before training starts.
        self.curr_state = self.env.reset()
        self.tong_count = 0
        self.curr_state = self.burn_in_memory(self.curr_state)
        self.train_rewards = []
        self.test_rewards = []
        self.steps = 0
        self.cum_reward = 0.0
        self.save_count = 0
Example #6
Votes: 0
    def __init__(
        self,
        n_actions=11,
        n_features=29,
        use_prioritized_experience_replay=True,
        max_trajectory_length=20,
    ):
        """Configure the DQN agent: dimensions, exploration schedule,
        replay buffer(s), and the TF graph/saver."""
        # Problem dimensions and (undiscounted) return.
        self.n_actions = n_actions
        self.n_features = n_features
        self.gamma = 1.

        # Optimisation / epsilon-greedy schedule.
        self.lr = 0.001
        self.epsilon = 0.5
        self.epsilon_min = 0
        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 1000

        # Target-network replacement cadence (hard replacement only).
        self.replace_target_iter = 100
        self.soft_update_iter = 1
        self.softupdate = False

        self.scope_name = "DQN-model"
        self.epoch = 0

        # Replay: uniform or prioritized depending on the flag.
        self.buffer_size = 5000 * max_trajectory_length
        self.batch_size = 512
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

        # Margin used by the large-margin loss term.
        self.margin_constant = 2

        # Build graph + saver under this agent's variable scope.
        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)
Example #7
Votes: 0
    def __init__(self, user_num, n_actions, cvr_n_features, ppo_n_features, init_roi, budget, use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Configure the constrained-PPO agent: hyper-parameters, CVR and
        PPO replay buffers, and the TF graph/saver.

        NOTE(review): init_roi and budget are accepted but never stored —
        presumably consumed elsewhere; confirm against the base class.
        """
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train

        # Discrete policy head over n_actions; one scalar action emitted.
        self.n_actions = n_actions
        self.action_dim = 1
        self.cvr_n_features = cvr_n_features
        self.ppo_n_features = ppo_n_features
        self.lr = 0.001
        self.use_predict_cvr = use_predict_cvr

        # Epsilon-greedy exploration schedule.
        self.user_based_adjust_times = 40
        self.epsilon = 0.4
        self.epsilon_min = 0.05
        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # PPO-specific knobs: clip range, adaptive-KL weight, surrogate type.
        self.epsilon_clip = 0.2
        self.lam = 0.5
        self.update_step = 1
        self.kl_target = 0.01
        self.gamma = 1.
        self.method = 'clip'
        self.policy_logvar = 1e-7

        # Old-policy ("target") network replacement cadence.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = False

        self.scope_name = "CPPO-model"
        self.epoch = 0

        # Buffer for the supervised CVR estimator.
        self.cvr_buffer_size = 1000 * max_trajectory_length
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        # PPO replay: prioritized or uniform.
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.ppo_buffer_size = 1000 * max_trajectory_length
        self.ppo_batch_size = 250
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.ppo_buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.ppo_buffer_size, save_return=True)

        # Build the TF graph and saver inside the agent's variable scope.
        with tf.variable_scope(self.scope_name):
            self._build_net()
            self.build_model_saver(self.scope_name)
Example #8
Votes: 0
class ConstrainedPPO(CMDPAgent):
    """Constrained-PPO bidding agent (TF1 graph mode).

    Combines a supervised CVR (conversion-rate) estimator with a PPO policy
    over discrete actions; supports clipped and KL-penalised surrogates and
    an optional prioritized replay buffer.
    """

    def init_parameters(self, sess):
        # Initialise variables, then hard-copy the eval actor into the
        # target ("old policy") actor so both nets start identical.
        # NOTE(review): super(CMDPAgent, self) skips CMDPAgent's own
        # override and dispatches to ITS base class — confirm intended.
        if self.has_target_net:
            super(CMDPAgent, self).init_parameters(sess)

            sess.run(self.a_target_replace_op)

    def __init__(self, user_num, n_actions, cvr_n_features, ppo_n_features, init_roi, budget, use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Set hyper-parameters, create CVR/PPO replay buffers, build graph.

        NOTE(review): init_roi and budget are accepted but never stored —
        presumably consumed by a base class; confirm.
        """
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = n_actions
        self.action_dim = 1
        self.cvr_n_features = cvr_n_features
        self.ppo_n_features = ppo_n_features
        self.lr = 0.001
        self.use_predict_cvr = use_predict_cvr

        # Epsilon-greedy exploration schedule.
        self.user_based_adjust_times = 40
        self.epsilon = 0.4
        self.epsilon_min = 0.05

        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # PPO surrogate settings: clip range, adaptive-KL weight, method.
        self.epsilon_clip = 0.2
        self.lam = 0.5
        self.update_step = 1
        self.kl_target = 0.01
        self.gamma = 1.
        self.method = 'clip'

        self.policy_logvar = 1e-7

        # Old-policy ("target") actor replacement cadence.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = False

        self.scope_name = "CPPO-model"

        self.epoch = 0

        # Buffer feeding the supervised CVR estimator.
        self.cvr_buffer_size = 1000 * max_trajectory_length
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        # PPO replay: prioritized (alpha/beta) or uniform.
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        self.ppo_buffer_size = 1000 * max_trajectory_length

        self.ppo_batch_size = 250
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ppo_buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.ppo_buffer_size, save_return=True)

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """Build the CVR-estimation MLP: user-id embedding + 3 FC layers
        ending in a sigmoid, returning a [batch, 1] CVR tensor."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            # Column 0 of `state` is the user id; embed it and concatenate
            # with the remaining dense features.
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)
            n_features = state.get_shape()[1]
            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1',
                                  kernel_initializer=initializers.xavier_initializer())

            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2',
                                  kernel_initializer=initializers.xavier_initializer())

            fc3 = tf.layers.dense(fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3',
                                  kernel_initializer=initializers.xavier_initializer())
            # Sigmoid keeps the predicted conversion rate in (0, 1).
            cvr_out = tf.sigmoid(tf.layers.dense(fc3, units=1, name='cvr',
                                                 kernel_initializer=initializers.xavier_initializer()))
            return cvr_out

    def _build_action_net(self, state, variable_scope):
        """Build the actor MLP: user-id embedding + 3 FC layers with a
        softmax head, returning [batch, n_actions] action probabilities."""
        with tf.variable_scope(variable_scope):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1',
                                  kernel_initializer=initializers.xavier_initializer())
            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2',
                                  kernel_initializer=initializers.xavier_initializer())
            fc3 = tf.layers.dense(fc2, units=n_features // 4, activation=tf.nn.relu, name='fc3',
                                  kernel_initializer=initializers.xavier_initializer())
            a_prob = tf.layers.dense(fc3, self.n_actions, tf.nn.softmax,
                                     kernel_initializer=initializers.xavier_initializer())
        return a_prob

    def _build_q_net(self, state, variable_scope, reuse=False):
        """Build the state-value critic MLP; returns a [batch] value tensor
        (final [:, 0] drops the singleton output dimension)."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1',
                                  kernel_initializer=initializers.xavier_initializer())
            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2',
                                  kernel_initializer=initializers.xavier_initializer())
            fc3 = tf.layers.dense(fc2, units=n_features // 4, activation=tf.nn.relu, name='fc3',
                                  kernel_initializer=initializers.xavier_initializer())
            v = tf.layers.dense(fc3, 1, kernel_initializer=initializers.xavier_initializer())
        return v[:, 0]

    def __make_update_exp__(self, vals, target_vals):
        """Return a TF op that Polyak-averages `vals` into `target_vals`
        (tau = 1e-2). Variables are paired by sorted name.
        NOTE(review): not referenced elsewhere in this class as shown."""
        polyak = 1.0 - 1e-2
        expression = []
        for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
            expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
        expression = tf.group(*expression)
        return expression

    def _build_net(self):
        """Create placeholders, the CVR/actor/critic subgraphs, the target
        hard-replacement op, losses, and the Adam training ops."""

        # CVR-estimator inputs.
        # NOTE(review): name='r' here duplicates the name of self.r below;
        # TF will uniquify the second one — confirm this is harmless.
        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        self.cvr = tf.placeholder(tf.float32, [None, ], name='r')

        # PPO transition inputs.
        self.s = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ppo_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')
        self.adv = tf.placeholder(tf.float32, [None, ], name='advantage')
        # Re-assigns the gamma set in __init__ (same value, 1.).
        self.gamma = 1.
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        # Sub-networks: CVR head, eval/target actors, critic.
        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net")
        self.critic = self._build_q_net(self.s, variable_scope="eval_q_net")

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))
        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        # Hard copy eval-actor weights into the target (old-policy) actor.
        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])

        with tf.variable_scope('loss'):
            # MSE between predicted and observed conversion rates.
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            self._train_ppo_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss)
            self._train_ppo_actor_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss)

    def _pick_loss(self):
        """Select which of the losses built in _build_loss are optimised."""
        self.has_target_net = True
        self.critic_loss = self.closs

        self.actor_loss = self.aloss

    def _build_loss(self):
        """Define critic (MC-return MSE), advantage, and PPO actor losses."""
        with tf.variable_scope('critic'):
            # Critic regresses the Monte-Carlo return.
            self.c_loss = self.return_value - self.critic
            self.closs = tf.reduce_mean(tf.square(self.c_loss))

            # Advantage = return - baseline (no GAE).
            self.advantage = self.return_value - self.critic

        with tf.variable_scope('surrogate'):

            # Probability of the taken action under new and old policies.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            pi_prob = tf.gather_nd(params=self.a_eval, indices=a_indices)
            oldpi_prob = tf.gather_nd(params=self.a_target, indices=a_indices)
            ratio = pi_prob / (oldpi_prob + 1e-8)
            surr = ratio * self.adv
            if self.method == 'kl_pen':
                # NOTE(review): kl_divergence expects Distribution objects,
                # but a_target/a_eval are probability tensors — this branch
                # looks broken; default method is 'clip' so it is unused.
                kl = tf.distributions.kl_divergence(self.a_target, self.a_eval)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.lam * kl))
            else:
                # Standard PPO clipped surrogate objective.
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - self.epsilon_clip, 1. + self.epsilon_clip) * self.adv))

    def build_model_saver(self, var_scope):
        """Create a Saver over all variables in `var_scope` (keep 1 ckpt)."""
        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope)

        self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1)

    def save(self, sess, path, step):
        """Checkpoint the model to `path` (directory created if missing)."""
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Restore model weights from a checkpoint at `path`."""
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Store (state, cvr) pairs from other_info["cvr"] into the CVR
        buffer; the action/reward/done slots are filled with dummy zeros."""
        cvr_trajectory = other_info["cvr"]
        for ele in cvr_trajectory:
            state, cvr = ele
            self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def experience_cmdp(self, new_trajectory, other_info=None):
        """Add a full trajectory to the PPO replay buffer (prioritized or
        uniform), discounting with self.gamma."""
        if self.use_prioritized_experience_replay:
            add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma)
        else:
            add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma)

    def get_agent_name(self):
        """Return the model scope name identifying this agent."""
        return self.scope_name

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Compute a bid: cvr * item_price / roi_thr, using the predicted
        CVR when use_predict_cvr is set, else the ground-truth CVR.
        Also returns CVR-estimation diagnostics."""
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]
        roi_thr = other_info["roi_thr"]

        observations = obs[np.newaxis, :]
        cvr = sess.run(self.predicted_cvr, feed_dict={
            self.s_cvr: observations
        })[0]
        if self.use_predict_cvr:
            bid = cvr * item_price / roi_thr
        else:
            bid = ground_truth_cvr * item_price / roi_thr
        return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}

    def get_cmdp_action(self, sess, obs, is_test=False, other_info=None):
        """Pick a discrete action: greedy at test time, epsilon-greedy
        during training."""
        if is_test:
            discrete_action = self.__greedy__(sess, obs)
        else:
            discrete_action = self.__epsilon_greedy__(sess, obs)

        return discrete_action

    def __greedy__(self, sess, observation):
        """Return the argmax action under the eval actor's probabilities."""
        s = observation[np.newaxis, :]

        prob_weights = sess.run(self.a_eval, feed_dict={self.s: s})
        greedy_action = np.argmax(prob_weights, axis=1)[0]

        return greedy_action

    def __epsilon_greedy__(self, sess, observation):
        """With probability epsilon pick a uniform random action, else the
        greedy one."""
        if np.random.uniform() < self.epsilon:

            action = np.random.randint(0, self.n_actions)
        else:
            action = self.__greedy__(sess, observation)
        return action

    def _is_exploration_enough(self, buffer, min_pool_size):
        """True once `buffer` holds at least `min_pool_size` transitions."""
        return len(buffer) >= min_pool_size

    def train_cvr(self, sess):
        """Run update_times_per_train supervised CVR updates.

        Returns (trained?, [loss, mean predicted cvr, mean target cvr]);
        the stats reflect only the LAST update in the loop.
        """
        if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size):
            return False, [0, 0, 0]

        cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size)
            obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index(
                sample_indices)

            _, cvr_loss, predicted_cvrs = sess.run(
                [self._train_cvr_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={
                    self.s_cvr: obs,
                    self.cvr: cvr_targets
                }
            )
        return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)]

    def get_memory_returns(self):
        """Return the running mean return of whichever replay buffer is active."""
        if self.use_prioritized_experience_replay:
            return self.prioritized_replay_buffer.current_mean_return
        else:
            return self.replay_buffer.current_mean_return

    def update_target(self, sess):
        """Hard-copy eval-actor weights into the old-policy actor every
        replace_target_iter epochs."""
        if self.epoch % self.replace_target_iter == 0:
            sess.run(self.a_target_replace_op)

    def train(self, sess):
        """One training epoch: refresh the old policy, run PPO updates, and
        decay epsilon on schedule.

        Returns (trained?, [loss, mc_loss, q_eval, returns], mean return,
        epsilon)."""
        if self.has_target_net:
            self.update_target(sess)
        self.epoch += 1

        buffer = self.prioritized_replay_buffer if self.use_prioritized_experience_replay else self.replay_buffer
        if not self._is_exploration_enough(buffer, self.ppo_batch_size):
            return False, [0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            loss, montecarlo_loss, q_eval, returns = self.train_prioritized(sess)
        else:

            loss, montecarlo_loss, q_eval, returns = self.train_normal(sess)

        # Step the epsilon decay schedule.
        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

            print("update epsilon:", self.epsilon)
        return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """Prioritized-replay training path.

        NOTE(review): this references self._train_ppo_op, self.loss,
        self.montecarlo_loss, self.q_eval_wrt_a and self.priority_values,
        none of which _build_net defines — calling this would raise
        AttributeError unless they are created elsewhere; confirm.
        """
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.prioritized_replay_buffer.make_index(self.ppo_batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            _, loss, montecarlo_loss, q_eval, \
            priority_values = sess.run(
                [self._train_ppo_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a,
                 self.priority_values],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                    self.important_sampling_weight_ph: weights,
                })

            # Small epsilon keeps priorities strictly positive.
            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(sample_indices, priorities)
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)

    def train_normal(self, sess):
        """Uniform-replay training path: compute advantages, update the
        critic, then update the actor (KL-penalty or clipped surrogate).

        Returns (actor loss, critic loss, mean value, mean return) from the
        last update iteration.
        """
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):

            sample_indices = self.replay_buffer.make_index(self.ppo_batch_size)

            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)

            # Advantages are computed with the CURRENT critic before it is
            # updated below.
            adv = sess.run(self.advantage, {self.s: obs, self.return_value: returns})

            _, montecarlo_loss, q_eval = sess.run(
                [self._train_ppo_critic_op, self.critic_loss, self.critic],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.adv: adv,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })
            if self.method == 'kl_pen':
                # Adaptive-KL actor updates: stop early if KL blows past
                # 4x target, then adjust the penalty weight lam.
                for _ in range(self.update_step):
                    _, kl, loss = sess.run(
                        [self._train_ppo_actor_op, self.kl_mean, self.actor_loss],
                        feed_dict={
                            self.adv: adv,
                            self.s: obs,
                            self.a: act,
                            self.r: rew,
                            self.done: done,
                        })
                    if kl > 4 * self.kl_target:
                        break
                if kl < self.kl_target / 1.5:
                    self.lam /= 2
                elif kl > self.kl_target * 1.5:
                    self.lam *= 2
                self.lam = np.clip(self.lam, 1e-4, 10)
            else:

                # Clipped-surrogate actor updates.
                for _ in range(self.update_step):
                    _, loss = sess.run(
                        [self._train_ppo_actor_op, self.actor_loss],
                        feed_dict={
                            self.adv: adv,
                            self.s: obs,
                            self.a: act,
                            self.r: rew,
                            self.done: done,
                            self.return_value: returns,

                        })

        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
Example #9
Votes: 0
class Agent_ppo():
    """PPO training harness for the NEL gym environment.

    Owns the environment, the replay / episode buffers, the PPO network,
    and all logging side effects (reward text files plus training/testing
    curve PNGs written under a timestamped dump directory).
    """

    def __init__(self, render=False):
        """Create the environment, buffers, network and log files.

        Args:
            render: when True, use the rendering variant of the env and
                render each step during episode generation.
        """
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        self.an = self.env.action_space.n  # No. of actions in env
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.method = 'PPO'
        self.test_curr_state = None
        self.log_time = 100.0   # steps between train-stat flushes
        self.test_time = 1000.0

        self.burn_in = PARAM.BURN_IN
        self.tmax = PARAM.A2C_EPISODE_SIZE_MAX
        self.tmin = PARAM.A2C_EPISODE_SIZE_MIN
        self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
        self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
        # NOTE: [[]] * tmax aliases a single list, but every slot is
        # overwritten with a fresh tuple before it is read, so the
        # aliasing is harmless here.
        self.episode_buffer = [[]] * self.tmax
        self.net = PPO(self.episode_buffer, self.replay_buffer)

        # Per-run dump directory for reward logs and plots.
        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

        self.curr_state = self.env.reset()
        self.tong_count = 0  # items currently held (see generate_episode)
        self.curr_state = self.burn_in_memory(self.curr_state)
        self.train_rewards = []
        self.test_rewards = []
        self.steps = 0
        self.cum_reward = 0.0
        self.save_count = 0

    def generate_episode(self, tmax, render=False):
        """Roll out `tmax` decision points, filling the episode buffer.

        Actions are chosen only every PARAM.ACTION_REPEAT env steps; in
        between, action 0 is repeated.  Shaped rewards (pseudo reward,
        tong pickup bonus) are stored in the episode buffer; the raw
        reward (scaled by 1/100) goes to the replay buffer.
        """
        ctr, i = (0, 0)
        self.her_reward_buffer = np.zeros(tmax)
        her_reward = 0
        while ctr < tmax:
            if i % PARAM.ACTION_REPEAT == 0:
                val, softmax, action = self.net.get_output(
                    [ctr - 1], seq_len=self.seq_len, batch_size=1)
            else:
                action = 0  # repeat a no-op between decision points

            next_state, reward, _, _ = self.env.step(action)
            if render:
                self.env.render()

            if PARAM.REWARD_SHAPING:
                psuedo_reward = self.compute_psuedo_reward(
                    next_state['vision'])
            else:
                psuedo_reward = 0.0

            # Track tong pickups: a zero-reward step while standing on a
            # tong (vision channel 0 at the agent cell) counts as a pickup;
            # a +100 reward consumes one.
            tong_reward = 0.0
            if reward == 0:
                if self.curr_state['vision'][5, 6, 0] == 1.0:
                    self.tong_count += 1
                    if PARAM.REWARD_SHAPING:
                        tong_reward = 10.0
            elif reward == 100.0:
                self.tong_count -= 1

            her_reward += reward
            if i % PARAM.ACTION_REPEAT == 0:
                self.episode_buffer[ctr] = (self.curr_state, action,
                                            ((reward + tong_reward) / 100.0 +
                                             psuedo_reward), next_state,
                                            softmax, self.tong_count, val)
                # Raw reward accumulated since the previous decision point,
                # consumed later by hindsight experience replay.
                self.her_reward_buffer[ctr] = her_reward
                her_reward = 0
                ctr += 1
            self.replay_buffer.add(self.curr_state, action, reward / 100.0,
                                   next_state, 0, self.tong_count)
            self.curr_state = next_state

            i += 1
            self.steps += 1
            self.cum_reward += reward
            if self.steps % 100 == 0:
                self.plot_train_stats()

    def compute_psuedo_reward(self, vision):
        """Shaped reward from the 5x5 vision patch around the agent.

        Averages channels, zeroes out cells at exactly 0.5 (background),
        and returns the remaining mass minus a 1/3 baseline; small values
        are clamped to 0.
        """
        avg = np.mean(vision[3:8, 3:8, :], axis=2)
        idxs = avg == 0.5
        avg[idxs] = 0.0
        reward = np.sum(avg) - 1.0 / 3.0
        if reward < 0.001:
            return 0.0

        return reward

    def hind_sight_experience_replay(self, episode_len):
        """Rewrite episode-buffer rewards with decayed future raw reward.

        Walks the episode backwards, accumulating the raw rewards stored
        in her_reward_buffer with decay PARAM.HER_DECAY.
        """
        her_reward = 0
        her_decay = PARAM.HER_DECAY
        for i in range(episode_len - 1, -1, -1):
            obs, action, reward, next_obs, softmax, tong_count, val = self.episode_buffer[
                i]
            self.episode_buffer[i] = (
                obs, action,
                (self.her_reward_buffer[i] + her_reward * her_decay) / 100.0,
                next_obs, softmax, tong_count, val)
            her_reward = her_reward * her_decay + self.her_reward_buffer[i]

    def train(self):
        """Main loop: generate an episode, optionally apply HER, update PPO."""
        for i in range(self.training_time):
            self.net.set_train()
            episode_len = np.random.randint(self.tmin, self.tmax + 1)
            self.generate_episode(episode_len, self.render)
            if PARAM.HER:
                self.hind_sight_experience_replay(episode_len)
            self.net.train(episode_len)
            self.save_count += 1

    def test(self, testing_steps=100, model_file=None):
        """Evaluate the current policy and log/plot the episode reward.

        Args:
            testing_steps: number of env steps to run.
            model_file: optional checkpoint to load before evaluating.
        """
        if model_file is not None:
            self.net.load_model(model_file)

        # BUGFIX: the test env was never created (its line in __init__ is
        # commented out), and the original appended to the undefined
        # `self.test_reward` and read the bare name `test_rewards`.
        # Create the env lazily and use `self.test_rewards` throughout.
        if not hasattr(self, 'test_env'):
            self.test_env = gym.make('NEL-v0')

        self.net.set_eval()
        cum_reward = 0.0
        for i in range(testing_steps):
            # NOTE(review): get_output is called elsewhere with an index /
            # seq_len signature; assumed here to also accept (state, step)
            # -- confirm against the PPO network API.
            softmax, action = self.net.get_output(self.curr_state, i)
            _, reward, _, _ = self.test_env.step(action)
            cum_reward += reward

        self.test_rewards.append(cum_reward)
        self.test_file.write(str(self.test_rewards[-1]))
        self.test_file.write('\n')
        self.test_file.flush()
        print('\nTest Reward: %.4f\n' % (self.test_rewards[-1]))

        x = list(range(len(self.test_rewards)))
        plt.plot(x, self.test_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Testing Curve')
        plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
        plt.close()

    def plot_train_stats(self):
        """Flush the averaged reward to file, checkpoint on progress, plot."""
        self.cum_reward = self.cum_reward / float(self.log_time)
        self.train_rewards.append(self.cum_reward)
        self.train_file.write(str(self.cum_reward))
        self.train_file.write('\n')
        self.train_file.flush()
        self.cum_reward = 0.0
        if self.train_rewards[-1] > 0:
            # Save a checkpoint whenever the windowed reward is positive.
            self.net.A.save("checkpoint.pth")
            print('[%d] Train Reward: %.4f' %
                  (len(self.train_rewards), self.train_rewards[-1]))
        self.steps = 0

        x = list(range(len(self.train_rewards)))
        plt.plot(x, self.train_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Training Curve')
        plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
        plt.close()

        plot(self.dump_dir + self.method, self.train_rewards)


#    if self.save_count > 0 and self.save_count % 500 == 0:
#      self.net.save_model_weights(self.save_count, self.dump_dir)

    def burn_in_memory(self, curr_state):
        # Initialize your replay memory with a burn_in number of episodes / transitions.
        cnt = 0
        while self.burn_in > cnt:
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)
            # NOTE(review): pickup is detected here via reward == 20.0,
            # but generate_episode uses reward == 0 plus a vision check --
            # confirm which convention the env actually uses.
            if reward == 20.0:
                self.tong_count += 1
            elif reward == 100.0:
                self.tong_count -= 1
            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0, self.tong_count)
            curr_state = next_state

            cnt = cnt + 1
        return curr_state
示例#10
0
    def __init__(
        self,
        user_num,
        n_actions,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train=1,
    ):
        """Configure a DQN bidding agent on top of a PID budget controller.

        Args:
            user_num: number of distinct users (sizes the user-id embedding).
            n_actions: size of the discrete action space.
            n_features: dimensionality of the state vector.
            init_roi: initial ROI threshold forwarded to PIDAgent.
            budget: total budget forwarded to PIDAgent.
            use_budget_control: whether PID budget pacing is enabled.
            use_prioritized_experience_replay: choose PER vs. uniform replay.
            max_trajectory_length: trajectory length used to size the buffers.
            update_times_per_train: gradient steps per train() call.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=2)
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = n_actions
        self.n_features = n_features
        self.gamma = 1.  # undiscounted returns
        self.lr = 0.001

        self.user_based_adjust_times = 40

        # Epsilon-greedy exploration schedule: start at 0.4, decay by 0.1
        # every epsilon_dec_iter epochs down to 0.05.
        self.epsilon = 0.4
        self.epsilon_min = 0.05

        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # Target-network sync: soft (Polyak) update every soft_update_iter
        # epochs when softupdate is True, else hard copy.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = True

        self.scope_name = "DQN-model"

        self.epoch = 0

        self.buffer_size = 1000 * max_trajectory_length

        self.batch_size = 512
        self.alpha = 0.6  # PER priority exponent
        self.beta = 0.4   # PER importance-sampling exponent
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)
        # Separate buffers tracking the cost and GMV components of reward.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size,
                                               save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)

        self.margin_constant = 2

        # Build the TF graph and its saver under a dedicated variable scope.
        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)
示例#11
0
    def __init__(self,
                 gamma,
                 action_number,
                 minibatch,
                 episodes,
                 begin_train,
                 copy_step,
                 epsilon_delta,
                 epsilon_start,
                 epsilon_end,
                 load_model,
                 path_to_load,
                 path_to_save,
                 plots_to_save,
                 episode_steps,
                 episode_to_save,
                 max_buffer_len,
                 model_type
                 ):
        """Configure a DQN-style agent with a target network and replay buffer.

        Args:
            gamma: discount factor (handled by the base class).
            action_number: size of the discrete action space.
            minibatch: batch size sampled from the replay buffer.
            episodes: total number of training episodes.
            begin_train: number of transitions to collect before training.
            copy_step: interval (in steps) for syncing the target network.
            epsilon_delta: per-step epsilon decrement.
            epsilon_start: initial epsilon value.
            epsilon_end: floor for epsilon decay.
            load_model: whether to restore weights (handled by the base class).
            path_to_load / path_to_save / plots_to_save: I/O paths.
            episode_steps: maximum steps per episode.
            episode_to_save: checkpointing interval in episodes.
            max_buffer_len: replay buffer capacity.
            model_type: network class; called as model_type(action_number).
        """
        super().__init__(gamma=gamma,
                         action_number=action_number,
                         path_to_load=path_to_load,
                         path_to_save=path_to_save,
                         plots_to_save=plots_to_save,
                         load_model=load_model,
                         episode_to_save=episode_to_save,
                         episodes=episodes,
                         model_type=model_type)
        # Epsilon

        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch

        # Episode Params

        self.begin_train = begin_train
        self.copy_step = copy_step
        self.episode_steps = episode_steps

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        # NOTE(review): self.device is presumably set by the base class;
        # the online model is also assumed to be created there -- confirm.
        self.target_model = model_type(action_number).to(self.device)
        self.update_target()

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []
        self.losses = []
        self.periodic_reward = 0
        self.periodic_rewards = []
示例#12
0
class ConstrainedDDPG(CMDPAgent):
    """DDPG agent for constrained bidding with an auxiliary CVR predictor.

    Bids are computed as cvr * item_price / roi_threshold, where the CVR is
    either a ground-truth value from the environment or the output of a
    supervised CVR network trained from its own replay buffer.  The DDPG
    actor/critic operates on the CMDP side (experience_cmdp / train).
    """

    def init_parameters(self, sess):
        """Initialize variables and hard-sync both target networks."""
        if self.has_target_net:
            # NOTE(review): super(CMDPAgent, self) skips CMDPAgent's own
            # init_parameters and calls its parent's -- confirm intentional.
            super(CMDPAgent, self).init_parameters(sess)
            sess.run(self.target_replace_op)
            sess.run(self.a_target_replace_op)

    def __init__(self, user_num, action_dim, action_bound, cvr_n_features, ddpg_n_features, init_roi, budget,
                 use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train=1, use_predict_cvr=False):
        """Configure buffers, exploration noise and build the TF graph.

        Args:
            user_num: number of distinct users (sizes the user-id embedding).
            action_dim: dimensionality of the continuous action.
            action_bound: bound on the action magnitude.
            cvr_n_features: state dimensionality for the CVR network.
            ddpg_n_features: state dimensionality for the DDPG networks.
            init_roi, budget, use_budget_control: budget-control settings.
            use_prioritized_experience_replay: choose PER vs. uniform replay.
            max_trajectory_length: trajectory length used to size buffers.
            update_times_per_train: gradient steps per train() call.
            use_predict_cvr: bid with the predicted CVR instead of the
                ground-truth CVR.
        """
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.n_actions = 1
        self.cvr_n_features = cvr_n_features
        self.ddpg_n_features = ddpg_n_features
        self.lr = 0.001
        self.use_predict_cvr = use_predict_cvr

        # Exploration schedule: with prob epsilon act noisily, else greedy;
        # epsilon decays by epsilon_dec every epsilon_dec_iter epochs.
        self.user_based_adjust_times = 40
        self.epsilon = 0.9
        self.epsilon_min = 0.05

        self.epsilon_dec = 0.3
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # Target-net sync: soft (Polyak) update every soft_update_iter epochs.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = True

        self.scope_name = "CDDPG-model"

        self.epoch = 0

        self.exploration_noise = OUNoise(self.action_dim)

        # Supervised CVR training buffer.
        self.cvr_buffer_size = 1000 * max_trajectory_length
        self.cvr_batch_size = 512
        self.cvr_replay_buffer = ReplayBuffer(self.cvr_buffer_size, save_return=False)

        self.alpha = 0.6  # PER priority exponent
        self.beta = 0.4   # PER importance-sampling exponent
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        self.ddpg_buffer_size = 1000 * max_trajectory_length

        self.ddpg_batch_size = 256
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.ddpg_buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.ddpg_buffer_size, save_return=True)

        # Build the graph and saver under a dedicated variable scope.
        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """3-layer MLP over [user-embedding, features] -> sigmoid CVR."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            # Column 0 of the state is the integer user id.
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)
            n_features = state.get_shape()[1]
            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1',
                                  kernel_initializer=initializers.xavier_initializer())

            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2',
                                  kernel_initializer=initializers.xavier_initializer())

            fc3 = tf.layers.dense(fc2, units=n_features // 2, activation=tf.nn.relu, name='fc3',
                                  kernel_initializer=initializers.xavier_initializer())
            cvr_out = tf.sigmoid(tf.layers.dense(fc3, units=1, name='cvr',
                                                 kernel_initializer=initializers.xavier_initializer()))
            return cvr_out

    def _build_q_net(self, state, action, variable_scope, reuse=False):
        """Critic: MLP over [user-embedding, features, action] -> scalar Q."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            state = tf.concat([state, tf.expand_dims(action, axis=1, name="2d-action")], axis=1)
            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1')

            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2')

            # NOTE(review): the head has action_dim units but only column 0
            # is used -- presumably intended as units=1; confirm.
            q = tf.layers.dense(fc2, units=self.action_dim, name='q')

            return q[:, 0]

    def _build_action_net(self, state, variable_scope):
        """Actor: MLP over [user-embedding, features] -> sigmoid action."""
        with tf.variable_scope(variable_scope):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 10], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]
            fc1 = tf.layers.dense(state, units=n_features, activation=tf.nn.relu, name='fc1')
            fc2 = tf.layers.dense(fc1, units=n_features // 2, activation=tf.nn.relu, name='fc2')

            actions = tf.layers.dense(fc2, self.action_dim, activation=tf.nn.sigmoid, name='a')

            return actions[:, 0]

    def __make_update_exp__(self, vals, target_vals):
        """Build a grouped Polyak (soft) update op: target <- 0.99*target + 0.01*eval."""
        polyak = 1.0 - 1e-2
        expression = []
        # Sort both lists by variable name so eval/target vars pair up.
        for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
            expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
        expression = tf.group(*expression)
        return expression

    def _build_net(self):
        """Create placeholders, CVR/actor/critic networks, losses and optimizers."""
        self.s_cvr = tf.placeholder(tf.float32, [None, self.cvr_n_features], name='s_cvr')
        # BUGFIX: this placeholder was created with name='r', colliding with
        # the reward placeholder below (TF silently uniquified it to 'r_1').
        # Feeds go through the Python handle, so behavior is unchanged.
        self.cvr = tf.placeholder(tf.float32, [None, ], name='cvr')

        self.s = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.ddpg_n_features], name='s_')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.float32, [None, ], name='a')
        self.gamma = 1.  # undiscounted TD target
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        self.cvr_net = self._build_cvr_net(self.s_cvr, variable_scope="cvr_net")
        self.predicted_cvr = self.cvr_net[:, 0]
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s_, variable_scope="actor_target_net")
        self.critic_eval = self._build_q_net(self.s, self.a, variable_scope="eval_q_net")
        # Same critic weights evaluated at the actor's output, for the actor loss.
        self.critic_eval_for_loss = self._build_q_net(self.s, self.a_eval, variable_scope="eval_q_net",
                                                      reuse=True)
        self.critic_target = self._build_q_net(self.s_, self.a, variable_scope="target_q_net")

        t_gmv_params = scope_vars(absolute_scope_name("target_q_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_q_net"))

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])
            self.target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])

        with tf.variable_scope('soft_update'):
            self.a_update_target_q = self.__make_update_exp__(ae_params, at_params)
            self.update_target_q = self.__make_update_exp__(e_gmv_params, t_gmv_params)

        with tf.variable_scope('q_target'):
            # One-step TD target; (1 - done) masks the bootstrap at terminals.
            self.td0_q_target = tf.stop_gradient(self.r + self.gamma * (1. - self.done) * self.critic_target)

            self.montecarlo_target = self.return_value

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(tf.squared_difference(self.predicted_cvr, self.cvr))

            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_cvr_op = tf.train.AdamOptimizer(self.lr).minimize(self.cvr_loss, var_list=cvr_params)
            self._train_ddpg_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, var_list=e_gmv_params)
            self._train_ddpg_a_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss, var_list=ae_params)

    def _pick_loss(self):
        """Select which prebuilt losses/priorities the train ops use."""
        self.has_target_net = True

        self.loss = self.ddpg_loss
        self.priority_values = self.td0_error
        self.actor_loss = self.a_loss

    def _build_loss(self):
        """Define critic TD/MC losses (IS-weighted under PER) and actor loss."""
        if self.use_prioritized_experience_replay:

            self.ddpg_loss = tf.reduce_mean(
                self.important_sampling_weight_ph * tf.squared_difference(self.td0_q_target, self.critic_eval,
                                                                          name='TD0_loss'))

            self.montecarlo_loss = tf.reduce_mean(self.important_sampling_weight_ph *
                                                  tf.squared_difference(self.montecarlo_target, self.critic_eval,
                                                                        name='MonteCarlo_error'))

        else:

            self.ddpg_loss = tf.reduce_mean(tf.squared_difference(self.td0_q_target, self.critic_eval, name='TD0_loss'))

            self.montecarlo_loss = tf.reduce_mean(tf.squared_difference(self.montecarlo_target, self.critic_eval,
                                                                        name='MonteCarlo_error'))

        # Actor maximizes Q at its own action (minimize the negative).
        self.a_loss = - tf.reduce_mean(self.critic_eval_for_loss)

        self.td0_error = tf.abs(self.td0_q_target - self.critic_eval)

        self.montecarlo_error = tf.abs(self.montecarlo_target - self.critic_eval)

    def build_model_saver(self, var_scope):
        """Create a Saver over all variables in `var_scope` (keep 1 checkpoint)."""
        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope)

        self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1)

    def save(self, sess, path, step):
        """Checkpoint the model at `path`, creating the directory if needed."""
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Restore model weights from `path`."""
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Store (state, cvr) pairs from other_info["cvr"] for CVR training."""
        cvr_trajectory = other_info["cvr"]
        for ele in cvr_trajectory:
            state, cvr = ele
            self.cvr_replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def experience_cmdp(self, new_trajectory, other_info=None):
        """Store a full trajectory into the DDPG replay buffer."""
        if self.use_prioritized_experience_replay:
            add_episode(self.prioritized_replay_buffer, new_trajectory, gamma=self.gamma)
        else:
            add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma)

    def get_agent_name(self):
        """Return the model scope name identifying this agent."""
        return self.scope_name

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Compute a bid = cvr * price / roi_threshold for one observation.

        Uses the predicted CVR when use_predict_cvr is set, otherwise the
        ground-truth CVR from other_info.  Also returns CVR diagnostics.
        """
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]
        roi_thr = other_info["roi_thr"]

        observations = obs[np.newaxis, :]
        cvr = sess.run(self.predicted_cvr, feed_dict={
            self.s_cvr: observations
        })[0]
        if self.use_predict_cvr:
            bid = cvr * item_price / roi_thr
        else:
            bid = ground_truth_cvr * item_price / roi_thr
        return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}

    def get_cmdp_action(self, sess, obs, is_test=False, other_info=None):
        """Greedy action at test time, epsilon-noisy action during training."""
        if is_test:
            discrete_action = self.__greedy__(sess, obs)
        else:
            discrete_action = self.__epsilon_greedy__(sess, obs)

        return discrete_action

    def __greedy__(self, sess, observation):
        """Run the eval actor on a single observation."""
        observation = observation[np.newaxis, :]
        greedy_action = sess.run(self.a_eval, feed_dict={self.s: observation})

        return greedy_action[0]

    def __epsilon_greedy__(self, sess, observation):
        """With prob epsilon: actor output + OU noise; else pure greedy."""
        if np.random.uniform() < self.epsilon:
            observation = observation[np.newaxis, :]
            actions_value = sess.run(self.a_eval, feed_dict={self.s: observation})

            action_noise = self.exploration_noise.noise()

            action = actions_value + action_noise

            action = action[0]
        else:
            action = self.__greedy__(sess, observation)
        return action

    def _is_exploration_enough(self, buffer, min_pool_size):
        """True once the buffer holds at least one batch worth of samples."""
        return len(buffer) >= min_pool_size

    def train_cvr(self, sess):
        """Run supervised CVR updates; returns (trained?, [loss, mean pred, mean target])."""
        if not self._is_exploration_enough(self.cvr_replay_buffer, self.cvr_batch_size):
            return False, [0, 0, 0]

        cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.cvr_replay_buffer.make_index(self.cvr_batch_size)
            obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.cvr_replay_buffer.sample_index(
                sample_indices)

            _, cvr_loss, predicted_cvrs = sess.run(
                [self._train_cvr_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={
                    self.s_cvr: obs,
                    self.cvr: cvr_targets
                }
            )
        return True, [cvr_loss, np.average(predicted_cvrs), np.average(cvr_targets)]

    def get_memory_returns(self):
        """Mean return currently stored in the active replay buffer."""
        if self.use_prioritized_experience_replay:
            return self.prioritized_replay_buffer.current_mean_return
        else:
            return self.replay_buffer.current_mean_return

    def update_target(self, sess):
        """Sync both target networks (soft Polyak or hard copy)."""
        if self.softupdate:

            if self.epoch % self.soft_update_iter == 0:
                sess.run(self.update_target_q)
                sess.run(self.a_update_target_q)
        else:

            if self.epoch % self.replace_target_iter == 0:
                sess.run(self.target_replace_op)
                sess.run(self.a_target_replace_op)

    def train(self, sess):
        """One training epoch: sync targets, update critic/actor, decay epsilon.

        Returns (trained?, [loss, mc_loss, q_eval, returns], mem_returns, epsilon).
        """
        if self.has_target_net:
            self.update_target(sess)
        self.epoch += 1

        buffer = self.prioritized_replay_buffer if self.use_prioritized_experience_replay else self.replay_buffer
        if not self._is_exploration_enough(buffer, self.ddpg_batch_size):
            return False, [0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            loss, montecarlo_loss, q_eval, returns = self.train_prioritized(sess)
        else:

            loss, montecarlo_loss, q_eval, returns = self.train_normal(sess)

        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

            print("update epsilon:", self.epsilon)
        return True, [loss, montecarlo_loss, q_eval, returns], self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """Critic updates from PER samples; refreshes sample priorities."""
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.prioritized_replay_buffer.make_index(self.ddpg_batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            _, loss, montecarlo_loss, q_eval, \
            priority_values = sess.run(
                [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval,
                 self.priority_values],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                    self.important_sampling_weight_ph: weights,
                })

            # Small epsilon keeps every transition's priority strictly positive.
            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(sample_indices, priorities)
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)

    def train_normal(self, sess):
        """Critic then actor updates from uniform replay samples."""
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.replay_buffer.make_index(self.ddpg_batch_size)

            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)

            _, loss, montecarlo_loss, q_eval = sess.run(
                [self._train_ddpg_critic_op, self.loss, self.montecarlo_loss, self.critic_eval],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })
            _, actor_loss = sess.run(
                [self._train_ddpg_a_op, self.actor_loss],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })

        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
示例#13
0
    def __init__(
        self,
        user_num,
        action_dim,
        action_bound,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train,
    ):
        """Configure a DDPG bidding agent on top of a PID budget controller.

        Args:
            user_num: number of distinct users (sizes the user-id embedding).
            action_dim: dimensionality of the continuous action.
            action_bound: bound on the action magnitude.
            n_features: dimensionality of the state vector.
            init_roi: initial ROI threshold forwarded to PIDAgent.
            budget: total budget forwarded to PIDAgent.
            use_budget_control: whether PID budget pacing is enabled.
            use_prioritized_experience_replay: choose PER vs. uniform replay.
            max_trajectory_length: trajectory length used to size the buffers.
            update_times_per_train: gradient steps per train() call.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=2)
        self.use_budget_control = use_budget_control
        self.user_num = user_num
        self.action_bound = action_bound
        self.action_dim = action_dim
        self.n_actions = 1
        self.n_features = n_features
        self.gamma = 1.  # undiscounted returns
        self.update_times_per_train = update_times_per_train

        self.lr = 0.001

        # Exploration schedule: epsilon decays by 0.3 every 100 epochs to 0.1.
        self.epsilon = 0.9
        self.epsilon_min = 0.1
        self.epsilon_dec = 0.3
        self.epsilon_dec_iter = 100

        # Target-net sync: soft (Polyak) update every epoch when softupdate
        # is True, else hard copy every replace_target_iter epochs.
        self.replace_target_iter = 300
        self.soft_update_iter = 1
        self.softupdate = True
        self.scope_name = "DDPG-model"

        self.epoch = 0

        # Ornstein-Uhlenbeck noise with linearly decaying weight.
        self.exploration_noise = OUNoise(self.action_dim)
        self.noise_weight = 1
        self.noise_descrement_per_sampling = 0.0001

        self.buffer_size = 20000 * max_trajectory_length
        self.batch_size = 512

        self.alpha = 0.6  # PER priority exponent
        self.beta = 0.4   # PER importance-sampling exponent
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)
        # Separate buffers tracking the cost and GMV components of reward.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size,
                                               save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)

        # Build the TF graph and its saver under a dedicated variable scope.
        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)
class ContextualBandit(PIDAgent, CvrAgent):
    """Myopic greedy bidder: learns a per-user CVR estimator and bids
    cvr * item_price / roi_threshold, where the ROI threshold comes from the
    PID budget controller when budget control is enabled."""

    def __init__(
        self,
        user_num,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        max_trajectory_length,
        update_times_per_train=1,
    ):
        """Build the CVR network, replay buffer and PID controller state.

        Args:
            user_num: size of the user-id embedding table.
            n_features: dimensionality of the raw state vector (user id in column 0).
            init_roi: initial/static ROI threshold.
            budget: campaign budget handed to the PID controller.
            use_budget_control: if True, bid against the PID-adjusted ROI threshold.
            max_trajectory_length: scales the replay-buffer capacity.
            update_times_per_train: supervised CVR batches per train() call.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=1)
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = 1
        self.n_features = n_features
        self.lr = 0.001

        self.scope_name = "MyopicGreedy-model"

        # Number of train() calls so far.
        self.epoch = 0

        self.buffer_size = 1000 * max_trajectory_length
        self.batch_size = 512
        self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=False)

        # All variables (and the saver) live under one named scope so that
        # save/restore can address exactly this agent's weights.
        with tf.variable_scope(self.scope_name):
            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_cvr_net(self, state, variable_scope, reuse=False):
        """CVR head: embed the user id (column 0 of `state`, 10-dim table),
        concatenate with the remaining features, one ReLU layer, then a
        sigmoid output in [0, 1]."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id",
                shape=[self.user_num, 10],
                initializer=initializers.xavier_initializer(),
                trainable=True,
                dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(
                user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)
            n_features = state.get_shape()[1]
            fc1 = tf.layers.dense(state,
                                  units=n_features,
                                  activation=tf.nn.relu,
                                  name='fc1')

            cvr_out = tf.sigmoid(tf.layers.dense(fc1, units=1, name='cvr'))
            return cvr_out

    def _build_net(self):
        """Placeholders plus MSE regression of predicted CVR onto targets."""

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        # CVR regression target (the placeholder is named 'r' in the graph).
        self.cvr = tf.placeholder(tf.float32, [
            None,
        ], name='r')

        self.cvr_net = self._build_cvr_net(self.s, variable_scope="cvr_net")
        # Squeeze the (batch, 1) network output to a (batch,) vector.
        self.predicted_cvr = self.cvr_net[:, 0]

        cvr_params = scope_vars(absolute_scope_name("cvr_net"))

        with tf.variable_scope('loss'):
            self.cvr_loss = tf.reduce_mean(
                tf.squared_difference(self.predicted_cvr, self.cvr))

        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cvr_loss, var_list=cvr_params)

    def build_model_saver(self, var_scope):
        """Create a Saver restricted to this agent's variable scope."""
        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope=var_scope)

        self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=1)

    def save(self, sess, path, step):
        """Checkpoint the model under `path` (parent directory created on demand)."""
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Load model weights from a checkpoint at `path`."""
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Store (state, cvr_target) pairs taken from other_info["cvr"].

        Only the state and cvr slots carry information; next_state is just the
        state again and the action/done/dis_2_end/return slots are zero-filled
        to satisfy the buffer's 7-slot transition layout.
        """
        cvr_trajectory = other_info["cvr"]
        for ele in cvr_trajectory:
            state, cvr = ele
            self.replay_buffer.add(state, 0, cvr, state, 0, 0, 0)

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Return a bid for the current impression.

        NOTE(review): the bid uses the *ground-truth* CVR from other_info;
        the network's predicted CVR is computed only for the over-estimation
        diagnostics returned alongside the bid — confirm this is intended.
        """
        item_price = other_info["proxy_ad_price"]
        ground_truth_cvr = other_info["cvr"]
        user_alpha = other_info["user_alpha"]
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi

        observations = obs[np.newaxis, :]
        cvr = sess.run(self.predicted_cvr, feed_dict={self.s: observations})[0]

        bid = ground_truth_cvr * item_price / roi_thr
        return bid, {"cvr_over_estimate": [user_alpha, ground_truth_cvr, cvr]}

    def _is_exploration_enough(self, min_pool_size):
        """True once the buffer holds at least min_pool_size transitions."""
        return len(self.replay_buffer) >= min_pool_size

    def train(self, sess):
        """Run update_times_per_train supervised CVR updates.

        Returns (did_train, [cvr_loss, mean_predicted_cvr, mean_cvr_target]);
        did_train is False until the buffer holds at least one full batch.
        """
        self.epoch += 1

        if not self._is_exploration_enough(self.batch_size):
            return False, [0, 0, 0]

        cvr_loss, predicted_cvrs, cvr_targets = 0, 0, 0
        for idx in range(self.update_times_per_train):
            sample_indices = self.replay_buffer.make_index(self.batch_size)
            obs, act, cvr_targets, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)

            _, cvr_loss, predicted_cvrs = sess.run(
                [self._train_op, self.cvr_loss, self.predicted_cvr],
                feed_dict={
                    self.s: obs,
                    self.cvr: cvr_targets
                })
        return True, [
            cvr_loss,
            np.average(predicted_cvrs),
            np.average(cvr_targets)
        ]
# --- Example #15 (file: Agent.py, project: karunraju/NFF) — scraped-page separator ---
class Agent():
    """Q-learning agent (Duel / DoubleQ) for the NEL gym environment.

    Owns the environment, an (optionally prioritized) replay buffer and the
    Q-network, and drives the training loop with periodic reward logging,
    plotting and evaluation.
    """

    def __init__(self, render=False, method='Duel'):
        """Set up the environment, hyper-parameters, replay memory and network.

        Args:
            render: if True, use the rendering variant of the NEL env.
            method: 'Duel' or 'DoubleQ'; anything else raises NotImplementedError.
        """
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        #self.test_env = gym.make('NEL-v0')
        # NOTE(review): test() reads self.test_env, which only exists if the
        # line above is re-enabled — calling test() as-is raises AttributeError.
        self.an = self.env.action_space.n  # No. of actions in env
        self.epsilon = 0.5  # exploration rate; decayed by update_epsilon()
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.df = PARAM.DISCOUNT_FACTOR  # Discount Factor
        self.batch_size = PARAM.BATCH_SIZE
        self.method = method
        self.test_curr_state = None
        self.log_time = 100.0  # steps between train-reward log points
        self.test_time = 1000.0
        self.prioritized_replay = PARAM.PRIORITIZED_REPLAY
        self.prioritized_replay_eps = 1e-6  # keeps every priority strictly positive
        self.prioritized_replay_alpha = 0.8
        self.prioritized_replay_beta0 = 0.4
        self.burn_in = PARAM.BURN_IN

        # Create Replay Memory (burn-in transitions are added in train()).
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha)
            # Anneal the importance-sampling exponent from beta0 to 1 over training.
            self.beta_schedule = LinearSchedule(
                float(self.training_time),
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
            self.beta_schedule = None

        # Create QNetwork instance
        if self.method == 'Duel':
            print('Using Duel Network.')
            self.net = DuelQNetwork(self.an)
        elif self.method == 'DoubleQ':
            print('Using DoubleQ Network.')
            self.net = DoubleQNetwork(self.an)
        else:
            raise NotImplementedError

        # Per-run output directory for reward logs, plots and checkpoints.
        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

    def update_epsilon(self):
        '''Linearly decay epsilon by (0.5 - 0.1) / 200000 per call, floored at 0.05.'''
        # (The previous docstring claimed "0.5 to 0.05 over 100000 iterations";
        # the actual schedule reaches the 0.05 floor after ~225000 calls.)
        if self.epsilon <= 0.05:
            self.epsilon = 0.05
            return

        self.epsilon = self.epsilon - (0.5 - 0.1) / 200000.0

    def epsilon_greedy_policy(self, q_values, epsilon):
        '''Return a random action with probability epsilon, else the greedy one.'''
        val = np.random.rand(1)
        if val <= epsilon:
            return np.random.randint(q_values.shape[1])
        return np.argmax(q_values)

    def greedy_policy(self, q_values):
        '''Return the index of the highest Q-value (test-time policy).'''
        return np.argmax(q_values)

    def train(self):
        '''Main training loop: act epsilon-greedily, store transitions, learn
        from replay minibatches, decay epsilon, and log/plot average rewards.'''
        train_rewards = []
        test_rewards = []
        count = 0
        steps = 0
        test_steps = 0

        cum_reward = 0.0

        curr_state = self.env.reset()
        curr_state = self.burn_in_memory(curr_state)
        prev_action = -1
        if self.render:
            self.env.render()
        for i in range(self.training_time):
            # Get q_values based on the current state
            Vt, St = self.get_input_tensor(curr_state)
            q_values = self.net.get_Q_output(Vt, St)

            # Selecting an action based on the policy
            action = self.epsilon_greedy_policy(q_values, self.epsilon)

            # Executing action in simulator
            nextstate, reward, _, _ = self.env.step(action)
            steps = steps + 1
            test_steps = test_steps + 1
            if self.render:
                self.env.render()

            # Store the transition only if the agent moved or changed action,
            # to avoid flooding the buffer with identical "stuck" transitions.
            # Rewards are scaled down by 100 for network stability.
            if nextstate['moved'] or prev_action != action:
                self.replay_buffer.add(curr_state, action, reward / 100.0,
                                       nextstate, 0)
            prev_action = action

            # Sample random minibatch from experience replay
            if self.prioritized_replay:
                batch, weights, batch_idxes = self.replay_buffer.sample(
                    self.batch_size, beta=self.beta_schedule.value(i))
            else:
                batch = self.replay_buffer.sample(self.batch_size)
                weights, batch_idxes = np.ones(self.batch_size), None

            # Train the Network with mini batches
            xVT, xST = self.get_input_tensors(batch)
            yT = self.get_output_tensors(batch)

            # Mask to select the actions from the Q network output
            mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
            for k, tran in enumerate(batch):
                mT[k, tran[1]] = 1
            td_errors = self.net.train(xVT, xST, yT, mT, weights)

            if self.prioritized_replay:
                # Standard PER update: new priority = |TD error| + eps.
                # (Bug fix: the importance-sampling *weights* were previously
                # fed back as priorities, which inverts the intended sampling
                # distribution; the commented-out original code showed the
                # |td_errors| + eps intent.)
                new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            # Decay epsilon
            self.update_epsilon()

            cum_reward += reward
            curr_state = nextstate

            # Every 100 steps, log and plot the average training reward.
            if steps == 100:
                cum_reward = cum_reward / float(self.log_time)
                train_rewards.append(cum_reward)
                self.train_file.write(str(cum_reward))
                self.train_file.write('\n')
                self.train_file.flush()
                cum_reward = 0.0
                print('Train Reward: %.4f' % (train_rewards[-1]))
                steps = 0

                x = list(range(len(train_rewards)))
                plt.plot(x, train_rewards, '-bo')
                plt.xlabel('Time')
                plt.ylabel('Average Reward')
                plt.title('Training Curve')
                plt.savefig(self.dump_dir + 'Training_Curve_' + self.method +
                            '.png')
                plt.close()

                plot(self.dump_dir + self.method, train_rewards)

            # NOTE(review): the periodic evaluation block that used to run here
            # (and was the only place `count` was incremented) is disabled, so
            # this checkpoint branch never fires — re-enable evaluation or tie
            # saving to `i` instead.
            if count > 0 and count % 30 == 0:
                self.net.save_model_weights(count, self.dump_dir)

    def test(self, testing_steps=100, model_file=None, capture=False):
        '''Run the near-greedy (epsilon=0.05) policy on the test env and
        return the average per-step reward.'''
        if model_file is not None:
            self.net.load_model(model_file)

        if capture:
            self.test_env = gym.wrappers.Monitor(self.test_env, './')

        epsilon = 0.05  # small residual exploration at test time
        rewards = []

        self.test_curr_state = self.test_env.reset()
        cum_reward = 0.0
        for i in range(testing_steps):
            # Initializing the episodes
            Vt, St = self.get_input_tensor(self.test_curr_state)
            q_values = self.net.get_Q_output(Vt, St)
            action = self.epsilon_greedy_policy(q_values, epsilon)

            # Executing action in simulator
            nextstate, reward, _, _ = self.test_env.step(action)

            cum_reward += reward
            self.test_curr_state = nextstate
        avg_reward = cum_reward / float(testing_steps)
        rewards.append(avg_reward)

        return avg_reward

    def burn_in_memory(self, curr_state):
        '''Seed the replay memory with burn_in random-action transitions and
        return the state reached at the end of the burn-in.'''
        cnt = 0
        while self.burn_in > cnt:
            # Random actions for burn-in; rewards scaled like in train().
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)

            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0)

            curr_state = next_state
            cnt = cnt + 1
        return curr_state

    def get_input_tensor(self, obs):
        ''' Returns an input tensor pair from a single observation:
        vision as a (1, 3, 11, 11) CHW tensor, scent (3 values) plus the
        moved flag as a (1, 4) tensor. '''
        iV = np.zeros((1, 3, 11, 11))
        iS = np.zeros((1, 4))

        iV[0] = np.moveaxis(obs['vision'], -1, 0)  # HWC -> CHW
        iS[0] = np.concatenate((obs['scent'], np.array([int(obs['moved'])])),
                               axis=0)
        iVt, iSt = torch.from_numpy(iV).float(), torch.from_numpy(iS).float()
        return iVt, iSt

    def get_input_tensors(self, batch, next_state=False):
        ''' Returns batched input tensors from sampled transitions; set
        next_state=True to build them from the successor states instead. '''
        V = np.zeros((self.batch_size, 3, 11, 11))
        S = np.zeros((self.batch_size, 4))
        for i, tran in enumerate(batch):
            if next_state:
                obs = tran[3]  # next state
            else:
                obs = tran[0]  # current state

            V[i] = np.moveaxis(obs['vision'], -1, 0)
            S[i] = np.concatenate(
                (obs['scent'], np.array([int(obs['moved'])])), axis=0)
        Vt, St = torch.from_numpy(V).float(), torch.from_numpy(S).float()
        return Vt, St

    def get_output_tensors(self, batch):
        ''' Returns the TD targets for the sampled batch, Double-Q style:
        the action is chosen by the online net, its value is taken from the
        target net. Note: targets are not masked on terminal states — the
        stored done flag is always 0 in this code. '''
        Y = np.zeros(self.batch_size)
        Vt, St = self.get_input_tensors(batch, next_state=True)
        q_values_a = self.net.get_Q_output(Vt, St)
        q_values_e = self.net.get_target_output(Vt, St)
        for i, tran in enumerate(batch):
            action = self.greedy_policy(q_values_a[i])
            Y[i] = tran[2] + self.df * q_values_e[i][action]

        Yt = torch.from_numpy(Y).float()
        return Yt
# --- Example #16 — scraped-page separator ---
    def __init__(self,
                 user_num,
                 action_dim,
                 n_features,
                 init_roi,
                 budget,
                 use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train
                 ):
        """PPO bidding agent constructor.

        NOTE(review): this def appears at module level without its enclosing
        class header — it is byte-identical to PPO_interface.__init__ below
        and looks like a duplicate left by the page scrape; confirm and drop.

        Args:
            user_num: size of the user-id embedding tables.
            action_dim: dimensionality of the action-space input.
            n_features: raw state dimensionality (user id in column 0).
            init_roi: initial ROI threshold for the PID controller.
            budget: campaign budget for the PID controller.
            use_budget_control: bid with the PID-adjusted ROI threshold if True.
            use_prioritized_experience_replay: select PER vs uniform replay.
            max_trajectory_length: scales the replay-buffer capacity.
            update_times_per_train: batches consumed per train() call.
        """
        PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=1)
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.action_dim = action_dim
        self.n_actions = 11  # discrete bid levels emitted by the actor softmax
        self.n_features = n_features
        self.lr = 0.001
        self.update_times_per_train = update_times_per_train

        # Epsilon-greedy exploration schedule (stepped down during training).
        self.epsilon = 0.5
        self.epsilon_min = 0.01
        self.epsilon_dec = 0.2
        self.epsilon_dec_iter = 100

        self.epsilon_clip = 0.2  # PPO clipping range
        self.replace_target_iter = 1  # hard-sync the old policy every epoch
        self.soft_update_iter = 1
        self.softupdate = False
        self.scope_name = "PPO-model"

        self.epoch = 0
        self.lam = 0.5  # adaptive KL penalty coefficient ('kl_pen' variant)

        self.update_step = 1  # inner gradient steps per batch
        self.kl_target = 0.01
        self.gamma = 1.
        self.method = 'clip'  # 'clip' or 'kl_pen' PPO objective

        self.policy_logvar = 1e-7  # NOTE(review): unused in the visible methods

        self.decay_rate = 0.9
        self.decay_steps = 5000

        self.global_ = tf.Variable(tf.constant(0))

        self.buffer_size = 1000 * max_trajectory_length
        self.batch_size = 500
        self.alpha = 0.6  # PER priority exponent
        self.beta = 0.4  # PER importance-sampling exponent
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        # Parallel buffers tracking the GMV and cost components of the reward.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)
# --- Example #17 — scraped-page separator ---
class PPO_interface(LearningAgent, PIDAgent):

    def __init__(self,
                 user_num,
                 action_dim,
                 n_features,
                 init_roi,
                 budget,
                 use_budget_control,
                 use_prioritized_experience_replay,
                 max_trajectory_length,
                 update_times_per_train
                 ):
        """PPO bidding agent over n_actions discrete bids, with separate GMV
        and cost critics combined through the PID-controlled ROI threshold.

        Args:
            user_num: size of the user-id embedding tables.
            action_dim: dimensionality of the action-space input.
            n_features: raw state dimensionality (user id in column 0).
            init_roi: initial ROI threshold for the PID controller.
            budget: campaign budget for the PID controller.
            use_budget_control: bid with the PID-adjusted ROI threshold if True.
            use_prioritized_experience_replay: select PER vs uniform replay.
            max_trajectory_length: scales the replay-buffer capacity.
            update_times_per_train: batches consumed per train() call.
        """
        PIDAgent.__init__(self, init_roi=init_roi, default_alpha=1, budget=budget, integration=1)
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.action_dim = action_dim
        self.n_actions = 11  # discrete bid levels emitted by the actor softmax
        self.n_features = n_features
        self.lr = 0.001
        self.update_times_per_train = update_times_per_train

        # Epsilon-greedy exploration schedule (stepped down in train()).
        self.epsilon = 0.5
        self.epsilon_min = 0.01
        self.epsilon_dec = 0.2
        self.epsilon_dec_iter = 100

        self.epsilon_clip = 0.2  # PPO clipping range
        self.replace_target_iter = 1  # hard-sync the old policy every epoch
        self.soft_update_iter = 1
        self.softupdate = False
        self.scope_name = "PPO-model"

        self.epoch = 0
        self.lam = 0.5  # adaptive KL penalty coefficient ('kl_pen' variant)

        self.update_step = 1  # inner gradient steps per batch
        self.kl_target = 0.01
        self.gamma = 1.
        self.method = 'clip'  # 'clip' or 'kl_pen' PPO objective

        self.policy_logvar = 1e-7  # NOTE(review): unused in the visible methods

        self.decay_rate = 0.9
        self.decay_steps = 5000

        self.global_ = tf.Variable(tf.constant(0))

        self.buffer_size = 1000 * max_trajectory_length
        self.batch_size = 500
        self.alpha = 0.6  # PER priority exponent
        self.beta = 0.4  # PER importance-sampling exponent
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.alpha,
                                                                     max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        # Parallel buffers tracking the GMV and cost components of the reward.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size, save_return=True)

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_net(self):
        """Build placeholders, the actor (eval/target) nets and three critics.

        The combined critic is critic_gmv - roi_thr * critic_cost, i.e. the
        value being learned is GMV net of ROI-weighted cost.
        """

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')

        self.r_gmv = tf.placeholder(tf.float32, [None, ], name='r_gmv')
        self.r_cost = tf.placeholder(tf.float32, [None, ], name='r_cost')
        self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')
        self.adv = tf.placeholder(tf.float32, [None, ], name='advantage')
        self.done = tf.placeholder(tf.float32, [None, ], name='done')
        self.gmv_return_value = tf.placeholder(tf.float32, [None, ], name='gmv_return')
        self.cost_return_value = tf.placeholder(tf.float32, [None, ], name='cost_return')
        self.return_value = tf.placeholder(tf.float32, [None, ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(tf.float32, [None], name="important_sampling_weight")

        # "eval" is the current policy; "target" holds the old policy used in
        # the PPO ratio and is hard-synced by update_target().
        self.a_eval = self._build_action_net(self.s, variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(self.s, variable_scope="actor_target_net")
        self.critic_gmv = self._build_q_net(self.s, variable_scope="critic_eval_gmv_net")
        self.critic_cost = self._build_q_net(self.s, variable_scope="critic_eval_cost_net")
        self.critic = self.critic_gmv - self.roi_thr * self.critic_cost

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))

        print(ae_params)
        print(at_params)

        with tf.variable_scope('hard_replacement'):
            # Copy each eval-net variable onto its target-net counterpart.
            self.a_target_replace_op = tf.group([tf.assign(t, e) for t, e in zip(at_params, ae_params)])

        self._build_loss()

        self._pick_loss()

        with tf.variable_scope('train'):
            self.gmv_ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.gmv_loss)
            self.cost_ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.cost_loss)
            self.ctrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss)
            self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss)

        with tf.variable_scope('roi'):
            # Estimated achievable long-term ROI; 1e-4 guards divide-by-zero.
            self.max_longterm_roi = self.critic_gmv / (self.critic_cost + 1e-4)

    def _pick_loss(self):
        """Wire the losses built in _build_loss to the names the optimizers use."""
        self.has_target_net = True  # train() will hard-sync the old policy
        self.critic_loss = self.closs

        self.gmv_loss = self.gmv_closs
        self.cost_loss = self.cost_closs
        self.actor_loss = self.aloss

    def _build_loss(self):
        """Define critic MSE losses, the advantage, and the PPO actor loss."""
        with tf.variable_scope('critic'):
            # Residuals of each critic against its Monte-Carlo return.
            self.gmv_c_loss = self.gmv_return_value - self.critic_gmv
            self.cost_c_loss = self.cost_return_value - self.critic_cost
            self.c_loss = self.return_value - self.critic

            self.gmv_closs = tf.reduce_mean(tf.square(self.gmv_c_loss))
            self.cost_closs = tf.reduce_mean(tf.square(self.cost_c_loss))
            self.closs = tf.reduce_mean(tf.square(self.c_loss))

            self.advantage = self.return_value - self.critic

        with tf.variable_scope('surrogate'):
            # pi(a|s) / pi_old(a|s) for the actions actually taken.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            pi_prob = tf.gather_nd(params=self.a_eval, indices=a_indices)
            oldpi_prob = tf.gather_nd(params=self.a_target, indices=a_indices)
            ratio = pi_prob / (oldpi_prob + 1e-8)
            surr = ratio * self.adv
            if self.method == 'kl_pen':
                # NOTE(review): tf.distributions.kl_divergence expects
                # Distribution objects, but a_target/a_eval are probability
                # tensors here — confirm this branch is ever exercised
                # (self.method defaults to 'clip').
                kl = tf.distributions.kl_divergence(self.a_target, self.a_eval)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.lam * kl))
            else:
                # Clipped surrogate objective (negated for minimization).
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - self.epsilon_clip, 1. + self.epsilon_clip) * self.adv))

    def update_target(self, sess):
        """Hard-copy the eval actor onto the target (old-policy) actor every
        replace_target_iter epochs."""
        if self.epoch % self.replace_target_iter != 0:
            return
        sess.run(self.a_target_replace_op)

    def train(self, sess):
        """One training epoch: sync the old policy, run PER or uniform-replay
        updates, and step the epsilon-greedy schedule.

        Returns (did_train, stats_list, memory_returns, epsilon); did_train is
        False (with zeroed stats) until the buffer holds one full batch.
        """
        if self.has_target_net:
            self.update_target(sess)

        self.epoch += 1

        if not self._is_exploration_enough(self.batch_size):
            return False, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \
            gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \
            cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_prioritized(sess)
        else:

            policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns, \
            gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \
            cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_normal(sess)

        # Step the exploration schedule every epsilon_dec_iter epochs.
        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)
            print("update epsilon:", self.epsilon)
        return True, [policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns,
                      gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns,
                      cost_loss, cost_montecarlo_loss, cost_q_eval,
                      cost_returns], self.get_memory_returns(), self.epsilon

    def _build_action_net(self, state, variable_scope):
        """Policy head: embed the user id (column 0, 20-dim table), two ReLU
        layers, then a softmax over the n_actions discrete bids."""
        with tf.variable_scope(variable_scope):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            l1 = tf.layers.dense(state, n_features // 2, tf.nn.relu)
            l2 = tf.layers.dense(l1, n_features // 4, tf.nn.relu)
            a_prob = tf.layers.dense(l2, self.n_actions, tf.nn.softmax)
        return a_prob

    def _build_q_net(self, state, variable_scope, reuse=False):
        """State-value head: embed the user id (column 0, 20-dim table), two
        ReLU layers, then a scalar value per sample (v[:, 0])."""
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id", shape=[self.user_num, 20], initializer=initializers.xavier_initializer(),
                trainable=True, dtype=tf.float32)
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            l1 = tf.layers.dense(state, n_features // 2, tf.nn.relu)
            l2 = tf.layers.dense(l1, n_features // 4, tf.nn.relu)
            v = tf.layers.dense(l2, 1)
        return v[:, 0]

    def train_normal(self, sess):
        """Run update_times_per_train PPO updates from uniform replay samples.

        One shared index set keeps the joint/gmv/cost buffers aligned; the
        advantage is computed once per batch, the three critics are fitted,
        then update_step actor steps run ('clip' or 'kl_pen' variant).
        Returns the flat 14-element stats tuple consumed by train().
        """
        policy_loss, policy_entropy = 0, 0
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0
        cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0
        # The ROI threshold comes from the PID controller only under budget control.
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi

        for idx in range(self.update_times_per_train):
            sample_indices = self.replay_buffer.make_latest_index(self.batch_size)

            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)
            obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index(
                sample_indices)
            obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index(
                sample_indices)

            # Advantage = return - combined critic (see _build_loss), evaluated
            # once per batch. (Removed two dead sess.run calls: one fetched
            # self.return_value, which merely echoes the fed placeholder, and
            # one evaluated critic_cost into an unused local.)
            adv = sess.run(self.advantage, {self.s: obs, self.return_value: returns, self.roi_thr: roi_thr})

            # All update ops below share the same feed, so build it once.
            feed = {
                self.adv: adv,
                self.s: obs,
                self.a: act,
                self.r_gmv: rew_gmv,
                self.r_cost: rew_cost,
                self.r: rew,
                self.done: done,
                self.gmv_return_value: gmv_returns,
                self.cost_return_value: cost_returns,
                self.return_value: returns,
                self.roi_thr: roi_thr,
            }

            # Critic updates (joint, GMV and cost heads).
            for _ in range(self.update_step):
                sess.run([self.ctrain_op, self.gmv_ctrain_op, self.cost_ctrain_op],
                         feed_dict=feed)

            if self.method == 'kl_pen':
                # Adaptive-KL PPO: stop early if KL runs away, then adapt lam
                # toward kl_target.
                for _ in range(self.update_step):
                    _, kl, loss, gmv_eval, cost_eval = sess.run(
                        [self.atrain_op, self.kl_mean, self.closs, self.critic_gmv, self.critic_cost],
                        feed_dict=feed)
                    if kl > 4 * self.kl_target:
                        break
                if kl < self.kl_target / 1.5:
                    self.lam /= 2
                elif kl > self.kl_target * 1.5:
                    self.lam *= 2
                self.lam = np.clip(self.lam, 1e-4, 10)
            else:
                # Clipped-surrogate PPO actor updates.
                for _ in range(self.update_step):
                    _, loss, q_eval, gmv_loss, gmv_q_eval, cost_loss, cost_q_eval \
                        = sess.run(
                        [self.atrain_op, self.closs, self.critic,
                         self.gmv_loss, self.critic_gmv,
                         self.cost_loss, self.critic_cost],
                        feed_dict=feed)

        return policy_loss, policy_entropy, loss, montecarlo_loss, np.average(q_eval), np.average(returns), \
               gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \
               cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)

    def __make_hardreplace_exp__(self, vals, target_vals):
        """Build one grouped op that copies each source variable onto its
        target counterpart, pairing variables by sorted name."""
        sources = sorted(vals, key=lambda v: v.name)
        targets = sorted(target_vals, key=lambda v: v.name)
        assign_ops = [dst.assign(src) for src, dst in zip(sources, targets)]
        return tf.group(*assign_ops)

    def build_model_saver(self, var_scope):
        """Create a Saver restricted to this agent's variable scope, keeping
        up to three checkpoints."""
        scoped_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope)
        self.model_saver = tf.train.Saver(var_list=scoped_vars, max_to_keep=3)

    def save(self, sess, path, step):
        """Checkpoint the model to ``path`` suffixed with global step ``step``.

        Creates the parent directory if needed. Uses
        ``os.makedirs(..., exist_ok=True)`` rather than an
        exists()-then-makedirs pair, which is race-prone when several
        processes checkpoint into the same directory.
        """
        parent = os.path.dirname(path)
        if parent:  # skip for bare filenames, where dirname is ''
            os.makedirs(parent, exist_ok=True)
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Reload model weights from checkpoint ``path`` into ``sess``."""
        msg = '%s model reloaded from %s' % (self.scope_name, path)
        self.model_saver.restore(sess, save_path=path)
        print(msg)

    def experience(self, new_trajectory, other_info=None):
        """Store a finished trajectory in the replay buffers.

        ``other_info`` must carry the GMV and cost views of the same episode
        under keys ``"gmv"`` and ``"cost"``; they are buffered separately.
        """
        gmv_trajectory = other_info["gmv"]
        cost_trajectory = other_info["cost"]
        main_buffer = (self.prioritized_replay_buffer
                       if self.use_prioritized_experience_replay
                       else self.replay_buffer)
        add_episode(main_buffer, new_trajectory, gamma=self.gamma)
        add_episode(self.gmv_replay_buffer, gmv_trajectory, gamma=self.gamma)
        add_episode(self.cost_replay_buffer, cost_trajectory, gamma=self.gamma)

    def __epsilon_greedy__(self, sess, observation, roi_thr):
        """Exploration policy: with probability ``self.epsilon`` sample an
        action from the policy's output distribution, otherwise act greedily.

        NOTE(review): the sampling branch fires when the draw is *below*
        epsilon, so epsilon is the exploration rate here.
        """
        if np.random.uniform() >= self.epsilon:
            return self.__greedy__(sess, observation, roi_thr)

        batched_obs = observation[np.newaxis, :]
        action_probs = sess.run(self.a_eval, feed_dict={self.s: batched_obs})
        sampled_action = np.random.choice(range(action_probs.shape[1]),
                                          p=action_probs.ravel())
        return sampled_action

    def __greedy__(self, sess, observation, roi_thr):
        """Return the argmax action of the policy net for one observation.

        ``roi_thr`` is accepted for interface symmetry with the epsilon-greedy
        variant but is not used in the computation.
        """
        batched_obs = observation[np.newaxis, :]
        scores = sess.run(self.a_eval, feed_dict={self.s: batched_obs})
        return np.argmax(scores, axis=1)[0]

    def choose_action(self, sess, observation, other_info):
        """Epsilon-greedy action selection; under budget control the PID ROI
        threshold replaces the static initial ROI."""
        roi_thr = (self.get_roi_threshold() if self.use_budget_control
                   else self.init_roi)
        return self.__epsilon_greedy__(sess, observation, roi_thr)

    def greedy_action(self, sess, observation, other_info):
        """Greedy bid with optional per-user budget gating.

        Without budget control, simply returns the greedy bid. With budget
        control, the user's *first* request (request_idx == 0) decides whether
        the user is worth serving: the predicted maximal long-term ROI must
        clear the PID threshold. Later requests reuse that stored decision.
        """
        roi_thr = (self.get_roi_threshold() if self.use_budget_control
                   else self.init_roi)
        bid = self.__greedy__(sess, observation, roi_thr)
        if not self.use_budget_control:
            return bid

        user_idx = other_info["user_idx"]
        request_idx = other_info["request_idx"]
        roi_threshold = self.get_roi_threshold()

        if request_idx != 0:
            # Decision was made on this user's first request; reuse it.
            return bid if self.is_user_selected(user_idx) else 0

        batched_obs = observation[np.newaxis, :]
        max_plongterm_roi = sess.run(
            self.max_longterm_roi,
            feed_dict={self.s: batched_obs, self.a: [bid]},
        )
        if max_plongterm_roi >= roi_threshold:
            self.explore_user(user_idx)
            return bid
        return 0.

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Map the learned discrete action index onto the environment's
        continuous bid range [bid_min, bid_max].

        Fix: with ``self.n_actions == 1`` (which this agent's ``__init__``
        actually sets) the original ``/(self.n_actions - 1)`` divided by
        zero. The denominator is now clamped to 1, so a single-action agent
        deterministically bids ``bid_min``.
        TODO(review): confirm bid_min is the intended degenerate bid for a
        single-action agent.
        """
        if is_test:
            discrete_action = self.greedy_action(sess, obs, other_info)
        else:
            discrete_action = self.choose_action(sess, obs, other_info)
        bid_max = MultiUserEnv.bid_max
        bid_min = MultiUserEnv.bid_min
        step = (bid_max - bid_min) / max(self.n_actions - 1, 1)

        other_action_info = {
            "learning_action": discrete_action
        }
        return bid_min + step * discrete_action, other_action_info

    def get_memory_returns(self):
        """Mean episode return currently held by the active replay buffer."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay
                         else self.replay_buffer)
        return active_buffer.current_mean_return

    def _is_exploration_enough(self, min_pool_size):
        """True once the active replay buffer holds at least
        ``min_pool_size`` transitions (i.e. training may begin)."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay
                         else self.replay_buffer)
        return len(active_buffer) >= min_pool_size
示例#18
0
def main(args):
    """Train exploration models on the tabletop (or Franka robot) environment.

    Per epoch: collect trajectories (random actions early, planned actions
    after epoch 100), store them in the image / low-dim replay buffers, then
    alternate gradient updates on the VAE, the dynamics model(s) and the
    auxiliary objective (classifiers, SMM density models, or ensemble
    disagreement), with optional logging of reconstructions and rankings.

    Fixes vs. previous revision:
      * the SMM goal/density optimizers were swapped — each now steps its
        own model's parameters;
      * the hard stop at ``args.max_epoch`` uses ``break`` instead of
        ``assert False`` (asserts are stripped under ``python -O``).
    """
    if args.logging > 0:
        import time
    # Low-dim replay state size depends on which articulated objects exist.
    replay_state_dim = 12
    if args.door == 1 or args.door == 3:
        replay_state_dim += 1
    elif args.door == 5:
        replay_state_dim += 1 + 3 * 2
    elif args.drawer:
        replay_state_dim += 3 * 3 + 1
    if not args.robot:
        replay_buffer = ReplayBuffer(
                                max_replay_buffer_size = args.replay_buffer_size,
                                trajectory_length=args.traj_length,
                                state_dim=replay_state_dim,
                                action_dim=args.action_dim,
                                savedir=args.log_dir,
                                )

    img_buffer = ImageBuffer( # 3x64x64 pixels
                                trajectory_length=args.num_traj_per_epoch*args.traj_length,
                                action_dim=args.action_dim,
                                savedir=args.log_dir,
                                memory_size=500,
                                )

    if args.logging == 0: # no env logging
        args.verbose = False
    if args.robot:
        import gym
        import franka_env
        env = gym.make("Franka-v0")
    else:
        env = Tabletop(
                    log_freq=args.env_log_freq, 
                    filepath=args.log_dir + '/env',
                    door=args.door,
                    drawer=args.drawer,
                    hard=args.hard,
                    verbose=args.verbose)

    # Separate env instance for visualization-only rollouts.
    if args.logging == 2:
        viz_env = Tabletop(
                    door=args.door,
                    drawer=args.drawer,
                    hard=args.hard,
                    log_freq=args.env_log_freq, 
                    filepath=None,
                    verbose=False)
    else:
        viz_env = None

    ''' Initialize models '''
    enc_dec = SimpleVAE(device, args.latent_dim, args.log_dir)
    if args.reload is not None:
        enc_dec.load_state_dict(torch.load(args.reload + '/enc_dec/{}model.bin'.format(args.reload_epoch)))
    enc_dec.to(device)
    enc_params = list(enc_dec.params)
    enc_optimizer = optim.Adam(enc_params, lr=1e-3) # just for enc_dec

    dynamics_models = None
    if args.dynamics_var:
        # Ensemble of 5 transition models for disagreement-based exploration.
        dynamics_models = []
        dyn_params = None
        for a in range(5):
            dynamics_model = TransitionModel(args.latent_dim, args.action_dim, args.log_dir, num=a)
            dynamics_model.to(device)
            dynamics_models.append(dynamics_model)
            if a == 0:
                dyn_params = list(dynamics_model.params)
            else:
                dyn_params += list(dynamics_model.params)
    else:
        dynamics_model = TransitionModel(args.latent_dim, args.action_dim, args.log_dir, recurrent=False)
        if args.reload is not None:
            dynamics_model.load_state_dict(torch.load(args.reload + '/dynamics_model/{}model.bin'.format(args.reload_epoch)))
        dynamics_model.to(device)
        dyn_params = list(dynamics_model.params)
    dyn_optimizer = optim.Adam(dyn_params, lr=1e-3) # just for transition model

    # If using Classifiers
    classifiers = None
    if args.use_classifiers is not None:
        classifiers = []
        for i in range(args.num_classifiers):
            if args.instance_normalized:
                classifier = BinClassifier_InsNorm(args.latent_dim, args.log_dir + '/classifier', i)
            else:
                classifier = BinClassifier(args.latent_dim, args.log_dir + '/classifier', i)
            if args.reload is not None:
                classifier.load_state_dict(torch.load(args.reload + '/classifier/{}/{}model.bin'.format(i, args.reload_epoch)))
            classifier.to(device)
            classifiers.append(classifier)
            if i == 0:
                c_params = list(classifier.params)
            else:
                c_params += list(classifier.params)
        c_optimizer = optim.Adam(c_params, lr=1e-3)

    # If using SMM
    density_vae = None
    goal_vae = None
    if args.smm:
        density_vae = VAEGoal(args.log_dir + '/density_model')
        goal_vae = VAEGoal(args.log_dir + '/goal')
        density_vae.to(device)
        goal_vae.to(device)
        d_params = list(density_vae.params)
        g_params = list(goal_vae.params)
        # BUGFIX: each optimizer now steps its own model's parameters
        # (previously g_optimizer was given d_params and vice versa).
        g_optimizer = optim.Adam(g_params, lr=1e-3)
        d_optimizer = optim.Adam(d_params, lr=1e-3)

    ''' Return goals '''
    goals = np.array(get_goal_imgs(args, env, filepath=args.log_dir + '/goal_ims')) 
    goals = goals / 255.  # scale pixels to [0, 1]
    goals = torch.tensor(goals).float().to(device)
    goals = goals.permute(0, 3, 1, 2)  # NHWC -> NCHW

    # If flag 0, no training losses log
    if args.logging > 0:
        hist = Hist(args.log_dir)

    env.max_path_length = args.traj_length * args.num_traj_per_epoch
    
    # Clean the env memory to make sure above code isn't affecting the env
    if not args.robot:
        env.initialize()
    ob, env_info = None, None
    for epoch in gt.timed_for(range(args.num_epochs), save_itrs=True):
        if args.logging > 0:
            start = time.time()
        init_low_dim = None
        obs_sample = []
        high_dim_sample = []
        ob, env_info = env.reset_model(add_noise=args.add_noise)
        if epoch == 0 and args.logging > 0 and not args.robot: 
            init_array = env.get_obs() * 255.
            init_img = Image.fromarray(init_array.astype(np.uint8))
            init_img.save(args.log_dir + '/init.png')
        '''
        Log low dim state for plotting block interaction bars
        '''
        if not args.robot:
            init_low_dim = get_obs(args, env_info) 
            obs_sample.append(init_low_dim)

        init_ob = ob

        eps_obs = []
        eps_next = []
        eps_act = []

        for i in range(args.num_traj_per_epoch):
            ob = torch.tensor(ob).unsqueeze(0).permute(0, 3, 1, 2).float().to(device)
            if i == 0: 
                high_dim_sample.append(ptu.get_numpy(ob.squeeze(0)))

            # Warm up with random actions; plan with the model afterwards.
            if epoch < 100:
                actions = get_random_action_sequence(env, args.traj_length, sample_sz = 1)
                actions = ptu.get_numpy(actions).squeeze(0)
            else:
                sorted_rewards, sorted_actions, sorted_preds = plan_actions(args, env, ob, dynamics_model, enc_dec, classifiers=classifiers, goal_vae=goal_vae, density_vae=density_vae, dynamics_models=dynamics_models)
                # Randomly select from the top K with highest reward
                act = np.random.choice(TOP_K)
                actions = sorted_actions[act]

                '''
                Log best and worst 3 trajectories (gifs and final state imgs)
                '''
                if args.logging > 0 and epoch % args.model_log_freq == 0:
                    log_rankings(args, enc_dec, hist.rankings_dir, viz_env, init_low_dim, sorted_actions, sorted_preds, epoch, i, sorted_rewards)

            action_sample = []
            for action in actions:
                # With low probability take a random action
                rand = np.random.uniform(0.0, 1.0)
                if rand < args.random_act_prob:
                    action = get_random_action_sequence(env, 1, sample_sz = 1).cpu().detach().numpy()
                    action = action.reshape(args.action_dim)

                next_ob, reward, terminal, env_info = env.step(action)
                ob = next_ob
                next_ob = torch.tensor(next_ob).permute(2, 0, 1).float().to(device).unsqueeze(0) # change to 3 x 64 x 64 obs
                high_dim_sample.append(ptu.get_numpy(next_ob.squeeze(0)))
                if not args.robot:
                    obs = get_obs(args, env_info)
                    obs_sample.append(obs) 
                    init_low_dim = obs_sample[-1].copy()
                action_sample.append(action)

            if not args.robot:
                replay_buffer.add_sample(
                        states=obs_sample[:-1],
                        next_states=obs_sample[1:],
                        actions=action_sample,
                        )
                last_obs = obs_sample[-1]

            eps_obs.append(high_dim_sample[:-1])
            eps_next.append(high_dim_sample[1:])
            eps_act.append(action_sample)

            last_frame = high_dim_sample[-1]
            obs_sample = []
            high_dim_sample = []
            # This becomes the init frame of the next traj
            if not args.robot:
                obs_sample.append(last_obs)
            high_dim_sample.append(last_frame)

        # reshape to -1, EPS SZ 50, 3, 64, 64
        eps_obs = np.array(eps_obs).reshape(-1, args.num_traj_per_epoch * args.traj_length, 3, 64, 64)
        if epoch == 1:
            # One-off sanity gif of a collected episode.
            with imageio.get_writer(args.log_dir + '/trial.gif', mode='I') as writer:
                for k, frame in enumerate(eps_obs[0]):
                    img = np.array(frame)
                    img = img.transpose((1, 2, 0)) * 255.0
                    writer.append_data(img.astype('uint8'))
        eps_next = np.array(eps_next).reshape(-1, args.num_traj_per_epoch * args.traj_length, 3, 64, 64)
        eps_act = np.array(eps_act).reshape(-1, args.num_traj_per_epoch * args.traj_length, img_buffer.action_dim)
        img_buffer.add_sample(
                    states=eps_obs,
                    next_states=eps_next,
                    actions=eps_act,
                    )

        # Gradually increase the horizon for training the dynamics model
        predlen = 10
        if epoch < 300:
            predlen = 8
        if epoch < 150:
            predlen = 4
        if epoch < 50:
            predlen = 2

        if epoch % args.update_freq == 0:
            print("Updating")
            if args.logging > 0 and epoch % args.loss_log_freq == 0:
                epoch_dynamics_loss = np.zeros((args.grad_steps_per_update,), dtype=float)
                epoch_vae_loss = np.zeros((args.grad_steps_per_update,), dtype=float)
                if args.use_classifiers is not None:
                    epoch_auxillary_loss = np.zeros((args.classifiers_grad_steps,), dtype=float)
                else:
                    epoch_auxillary_loss = np.zeros((args.grad_steps_per_update,), dtype=float)

            for grstep in range(args.grad_steps_per_update):
                losses = []
                # Return [batch_sz, predlen, 3, 64, 64]
                obs, next_obs, actions, success = img_buffer.draw_samples(batch_size=args.batch_sz, length=predlen)

                obs = torch.tensor(obs).float().to(device) 
                next_obs = torch.tensor(next_obs).float().to(device)
                actions = torch.tensor(actions).float().to(device)
                _, _, _, _, obs_z = enc_dec.forward(obs)
                _, _, _, _, next_z = enc_dec.forward(next_obs)
                g_ind = np.random.randint(0, len(goals), args.batch_sz) 
                g_samples = goals[g_ind]
                _, _, _, _, goal_z = enc_dec.forward(g_samples.unsqueeze(1))

                if args.dynamics_var:
                    ''' Train dynamics models in disagreement ensemble '''
                    dynamics_loss = train_disgrmt_ensemble(img_buffer, dynamics_models, enc_dec, dyn_optimizer, args.batch_sz, predlen)
                    auxillary_loss = dynamics_loss
                else:
                    dynamics_loss, pred_z = train_dynamics_model(dynamics_model, obs_z, next_z, actions, dyn_optimizer)
                if args.logging > 0 and epoch % args.model_log_freq == 0 and not args.dynamics_var:
                    dynamics_preds = enc_dec.dec(pred_z.float())
                    # decode pred_z through the decoder & compare with next_obs
                    for num in range(3):
                        dynamics_pred = dynamics_preds[num]
                        dynamics_true = next_obs[num]
                        dynamics_pred = ptu.get_numpy(dynamics_pred.permute(0, 2, 3, 1))
                        dynamics_true = ptu.get_numpy(dynamics_true.permute(0, 2, 3, 1))
                        dynamics_true = (dynamics_true * 255.).astype(np.uint8)
                        dynamics_pred = (dynamics_pred * 255.).astype(np.uint8)
                        path = args.log_dir + '/dynamics_preds/' + str(epoch)
                        if not os.path.exists(path):
                            os.makedirs(path)
                        with imageio.get_writer(path + '/train_true' + str(num) + '.gif', mode='I') as writer:
                            for e in range(len(dynamics_true)):
                                writer.append_data(dynamics_true[e])
                        with imageio.get_writer(path + '/train_pred' + str(num) + '.gif', mode='I') as writer:
                            for e in range(len(dynamics_pred)):
                                writer.append_data(dynamics_pred[e])

                ''' Train classifiers ''' 
                if args.use_classifiers is not None and grstep < args.classifiers_grad_steps:
                    score_path = None
                    if args.logging > 0 and epoch % args.model_log_freq == 0:
                        score_path = args.log_dir + '/classifier_scores/' + str(epoch)
                        if not os.path.exists(score_path):
                            os.makedirs(score_path)
                    auxillary_loss = train_classifiers(classifiers, enc_dec, obs, goals, c_optimizer, args.batch_sz, score_path)

                ''' Update SMM density models '''
                if args.smm:
                    auxillary_loss = train_smm_density_models(density_vae, goal_vae, obs_z, goal_z, d_optimizer, g_optimizer)

                '''Train main vae'''
                vae_loss, g_rec, ng_rec = train_vae(args, enc_dec, obs, g_samples, enc_optimizer, args.beta)
                if args.logging > 0 and epoch % args.model_log_freq == 0:
                    # save g_rec & g_samples
                    if g_rec is not None:
                        g_rec = g_rec.cpu().detach()
                        g_rec = g_rec * 255.0
                        r_imgs = g_rec.squeeze(1).permute(0, 2, 3, 1).reshape(-1, 64, 64, 3)
                        r_imgs = ptu.get_numpy(r_imgs).astype(np.uint8)
                        g_true = g_samples * 255.0
                        t_imgs = g_true.permute(0, 2, 3, 1).reshape(-1, 64, 64, 3)
                        t_imgs = ptu.get_numpy(t_imgs).astype(np.uint8)
                        for im in range(5):
                            img = Image.fromarray(r_imgs[im])
                            path = args.log_dir + '/vae_recs/' + str(epoch)
                            if not os.path.exists(path):
                                os.makedirs(path)
                            img.save(path + '/g_rec' + str(im) + '.png')
                            img = Image.fromarray(t_imgs[im])
                            img.save(path + '/g_true' + str(im) + '.png')

                    ng_rec = ng_rec * 255.0
                    r_imgs = ng_rec.squeeze(1).permute(0, 2, 3, 1).reshape(-1, 64, 64, 3)
                    r_imgs = ptu.get_numpy(r_imgs).astype(np.uint8)
                    ng_true = obs[:,0,:,:,:] * 255.0
                    t_imgs = ng_true.permute(0, 2, 3, 1).reshape(-1, 64, 64, 3)
                    t_imgs = ptu.get_numpy(t_imgs).astype(np.uint8)
                    for im in range(5):
                        img = Image.fromarray(r_imgs[im])
                        path = args.log_dir + '/vae_recs/' + str(epoch)
                        if not os.path.exists(path):
                            os.makedirs(path)
                        img.save(path + '/ng_rec' + str(im) + '.png')
                        img = Image.fromarray(t_imgs[im])
                        img.save(path + '/ng_true' + str(im) + '.png')

                if args.logging > 0 and epoch % args.loss_log_freq == 0:
                    epoch_dynamics_loss[grstep] = dynamics_loss
                    epoch_vae_loss[grstep] = vae_loss
                    if args.dynamics_var or args.smm or grstep < args.classifiers_grad_steps:
                        epoch_auxillary_loss[grstep] = auxillary_loss

        if args.logging > 0:
            end = time.time()
            print("===== EPISODE {} FINISHED IN {}s =====".format(epoch, end - start))

        if args.logging > 0 and epoch % args.loss_log_freq == 0 and epoch > 0:
            hist.save_losses(
                            epoch_auxillary_loss.mean(), # e.g. SMM, disagreement, classifiers max
                            epoch_dynamics_loss.mean(),
                            epoch_vae_loss.mean(),
                            )
            if args.logging == 2:
                # NOTE(review): this prints the attribute/method object itself;
                # possibly `hist.report_losses()` was intended — confirm.
                print(hist.report_losses)

        if epoch % args.model_log_freq == 0:
            torch.save(enc_dec.state_dict(), enc_dec.savedir + '/{}model.bin'.format(epoch))
            if args.dynamics_var:
                for model in dynamics_models:
                    torch.save(model.state_dict(), model.savedir + '/{}model.bin'.format(epoch))
            else:
                torch.save(dynamics_model.state_dict(), dynamics_model.savedir + '/{}model.bin'.format(epoch))
            if args.use_classifiers is not None:
                for classifier in classifiers:
                    torch.save(classifier.state_dict(), classifier.savedir + '/{}model.bin'.format(epoch))
            if args.smm:
                torch.save(goal_vae.state_dict(), goal_vae.savedir + '/{}model.bin'.format(epoch))
                torch.save(density_vae.state_dict(), density_vae.savedir + '/{}model.bin'.format(epoch))
            if args.logging > 0:
                hist.save_losses_txt()

        if epoch == args.max_epoch:
            # Hard stop; was `assert(False)`, which is a no-op under -O.
            break
示例#19
0
class DQN_interface(LearningAgent):
    """Double-DQN bidding agent built on a TF1 static graph.

    Constructs eval/target Q-networks, hard and soft (Polyak) target-update
    ops, and several candidate losses (TD0, double-DQN, Monte-Carlo, margin);
    ``_pick_loss`` selects the loss that is actually trained and the error
    used for prioritized-replay priorities.
    """
    def __init__(
        self,
        n_actions=11,
        n_features=29,
        use_prioritized_experience_replay=True,
        max_trajectory_length=20,
    ):
        """Set hyper-parameters, create the replay buffer, and build the graph.

        n_actions: size of the discrete action space.
        n_features: observation vector length.
        use_prioritized_experience_replay: choose PER vs. uniform replay.
        max_trajectory_length: used only to size the replay buffer.
        """
        self.n_actions = n_actions
        self.n_features = n_features
        # Undiscounted returns (gamma = 1).
        self.gamma = 1.

        self.lr = 0.001
        # Epsilon-greedy schedule: decays by epsilon_dec every
        # epsilon_dec_iter training epochs, floored at epsilon_min.
        self.epsilon = 0.5
        self.epsilon_min = 0
        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 1000
        # Target-network sync: hard copy every replace_target_iter epochs,
        # or Polyak soft update every soft_update_iter epochs if softupdate.
        self.replace_target_iter = 100
        self.soft_update_iter = 1
        self.softupdate = False
        self.scope_name = "DQN-model"

        # Counts training calls; drives epsilon decay and target syncs.
        self.epoch = 0

        self.buffer_size = 5000 * max_trajectory_length
        self.batch_size = 512
        # PER exponents: alpha shapes priorities, beta the IS correction.
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)

        # Margin added to non-taken actions in the (unused-by-default)
        # margin losses below.
        self.margin_constant = 2

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_net(self):
        """Build placeholders, both Q-networks, target-sync ops, losses,
        and the Adam training op (updates eval-net variables only)."""

        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features],
                                 name='s_')
        self.r = tf.placeholder(tf.float32, [
            None,
        ], name='r')
        self.a = tf.placeholder(tf.int32, [
            None,
        ], name='a')
        self.done = tf.placeholder(tf.float32, [
            None,
        ], name='done')
        self.return_value = tf.placeholder(tf.float32, [
            None,
        ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(
            tf.float32, [None], name="important_sampling_weight")

        self.q_eval = self._build_q_net(self.s,
                                        self.n_actions,
                                        variable_scope="eval_net")
        self.q_next = self._build_q_net(self.s_,
                                        self.n_actions,
                                        variable_scope="target_net")

        t_params = scope_vars(absolute_scope_name("target_net"))
        e_params = scope_vars(absolute_scope_name("eval_net"))

        with tf.variable_scope('hard_replacement'):
            # target <- eval, full copy.
            self.target_replace_op = tf.group(
                [tf.assign(t, e) for t, e in zip(t_params, e_params)])

        with tf.variable_scope('soft_update'):
            # target <- polyak * target + (1 - polyak) * eval.
            self.update_target_q = self.__make_update_exp__(e_params, t_params)

        with tf.variable_scope('q_target'):
            # Standard TD(0) target: r + gamma * max_a' Q_target(s', a').
            self.td0_q_target = tf.stop_gradient(
                self.r + self.gamma * (1. - self.done) *
                tf.reduce_max(self.q_next, axis=1, name='Qmax_s_'))

            # Double-DQN: eval net picks the action, target net values it.
            target_action = tf.argmax(self.q_eval,
                                      axis=-1,
                                      output_type=tf.int32)
            target_a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), target_action],
                axis=1)
            target_q_sa = tf.gather_nd(params=self.q_next,
                                       indices=target_a_indices)
            self.double_dqn_target = tf.stop_gradient(self.r + self.gamma *
                                                      (1. - self.done) *
                                                      target_q_sa)

            # Monte-Carlo target: the observed discounted episode return.
            self.montecarlo_target = self.return_value

        with tf.variable_scope('q_eval'):
            # Q(s, a) for the actions actually taken in the batch.
            a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a],
                axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval,
                                             indices=a_indices)

        with tf.variable_scope('loss'):
            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss, var_list=e_params)

    def _pick_loss(self):
        """Select the trained loss and the error tensor used as PER priority.

        NOTE(review): 'doubel_dqn_error' is a long-standing typo, but it is
        consistent with its definition in ``_build_loss``.
        """
        self.loss = self.double_dqn_loss
        self.priority_values = self.doubel_dqn_error

    def _build_loss(self):
        """Define all candidate losses (TD0, double-DQN, Monte-Carlo, margin)
        and the absolute TD errors used for replay priorities.

        With PER enabled, the squared TD losses are weighted by the
        importance-sampling weights placeholder.
        """

        if self.use_prioritized_experience_replay:

            self.dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph * tf.squared_difference(
                    self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss'))

            self.double_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.double_dqn_target,
                                      self.q_eval_wrt_a,
                                      name='Double_DQN_error'))
        else:

            self.dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_q_target,
                                      self.q_eval_wrt_a,
                                      name='TD0_loss'))

            self.double_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.double_dqn_target,
                                      self.q_eval_wrt_a,
                                      name='Double_DQN_error'))

        self.montecarlo_loss = tf.reduce_mean(
            tf.squared_difference(self.montecarlo_target,
                                  self.q_eval_wrt_a,
                                  name='MonteCarlo_error'))

        # Absolute errors; one of these becomes the PER priority signal.
        self.td0_error = tf.abs(self.td0_q_target - self.q_eval_wrt_a)
        self.doubel_dqn_error = tf.abs(self.double_dqn_target -
                                       self.q_eval_wrt_a)
        self.montecarlo_error = tf.abs(self.montecarlo_target -
                                       self.q_eval_wrt_a)

        # Large-margin (imitation-style) losses: penalize any non-taken
        # action whose Q comes within margin_constant of Q(s, a_taken).
        margin_diff = tf.one_hot(self.a,
                                 self.n_actions,
                                 on_value=0.,
                                 off_value=1.,
                                 dtype=tf.float32) * self.margin_constant
        self.margin_loss = tf.reduce_mean(
            tf.reduce_max(self.q_eval + margin_diff, axis=1, keepdims=False) -
            self.q_eval_wrt_a)
        self.mse_margin_loss = tf.reduce_mean(
            tf.squared_difference(
                tf.reduce_max(self.q_eval + margin_diff,
                              axis=1,
                              keepdims=False), self.q_eval_wrt_a))

    def _build_q_net(self, state, n_actions, variable_scope):
        """Two-layer MLP Q-network: ReLU hidden layer (n_features units),
        linear output of n_actions Q-values."""
        with tf.variable_scope(variable_scope):
            fc1 = tf.layers.dense(state,
                                  units=self.n_features,
                                  activation=tf.nn.relu,
                                  name='fc1')
            q_out = tf.layers.dense(fc1, units=n_actions, name='q')
            return q_out

    def __make_update_exp__(self, vals, target_vals):
        """Grouped Polyak soft-update op: target <- 0.99*target + 0.01*eval.

        Variables are paired by sorted name.
        """
        polyak = 1.0 - 1e-2
        expression = []
        for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                                   sorted(target_vals, key=lambda v: v.name)):
            expression.append(
                var_target.assign(polyak * var_target + (1.0 - polyak) * var))
        expression = tf.group(*expression)
        return expression

    def __make_hardreplace_exp__(self, vals, target_vals):
        """Grouped hard-copy op: target <- eval, paired by sorted name."""
        expression = []
        for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                                   sorted(target_vals, key=lambda v: v.name)):
            expression.append(var_target.assign(var))

        expression = tf.group(*expression)
        return expression

    def build_model_saver(self, var_scope):
        """Create a Saver over the variables in ``var_scope``; keeps 3
        checkpoints."""
        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope=var_scope)

        self.model_saver = tf.train.Saver(var_list=var_list, max_to_keep=3)

    def save(self, sess, path, step):
        """Checkpoint to ``path`` (parent dir created if missing) at global
        step ``step``."""
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Reload model weights from checkpoint ``path`` into ``sess``."""
        self.model_saver.restore(sess, save_path=path)
        print('%s model reloaded from %s' % (self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Append a finished trajectory to the active replay buffer."""
        if self.use_prioritized_experience_replay:
            add_episode(self.prioritized_replay_buffer,
                        new_trajectory,
                        gamma=self.gamma)
        else:
            add_episode(self.replay_buffer, new_trajectory, gamma=self.gamma)

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Pick a discrete action (greedy at test time, epsilon-greedy in
        training) and scale it to the environment's action value.

        Returns (scaled_action, info_dict with the raw discrete index).
        """
        if is_test:
            discrete_action = self.greedy_action(sess, obs)
        else:
            discrete_action = self.choose_action(sess, obs)

        other_action_info = {"learning_action": discrete_action}
        # Factor 3 is presumably the env-specific bid granularity — confirm
        # against the environment's action scaling.
        return 3 * discrete_action, other_action_info

    def choose_action(self, sess, observation):
        """Epsilon-greedy: random action with prob epsilon, else argmax Q."""

        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            action = np.random.randint(0, self.n_actions)
        else:

            actions_value = sess.run(self.q_eval,
                                     feed_dict={self.s: observation})
            action = np.argmax(actions_value, axis=1)[0]

        return action

    def greedy_action(self, sess, single_observation):
        """Return the argmax-Q action for a single observation."""
        observation = single_observation[np.newaxis, :]
        actions_value = sess.run(self.q_eval, feed_dict={self.s: observation})
        greedy_action = np.argmax(actions_value, axis=1)[0]
        return greedy_action

    def get_memory_returns(self):
        """Mean episode return currently stored in the active replay buffer."""
        if self.use_prioritized_experience_replay:
            return self.prioritized_replay_buffer.current_mean_return
        else:
            return self.replay_buffer.current_mean_return

    def _is_exploration_enough(self, min_pool_size):
        """True once the active buffer holds >= min_pool_size transitions."""
        if self.use_prioritized_experience_replay:
            return len(self.prioritized_replay_buffer) >= min_pool_size
        else:
            return len(self.replay_buffer) >= min_pool_size

    def update_target(self, sess):
        """Sync the target network: Polyak soft update or periodic hard copy,
        depending on ``self.softupdate``."""
        if self.softupdate:

            if self.epoch % self.soft_update_iter == 0:
                sess.run(self.update_target_q)
        else:

            if self.epoch % self.replace_target_iter == 0:
                sess.run(self.target_replace_op)

    def train(self, sess):
        """Run one training epoch: sync target net, take a gradient step on a
        sampled batch, and decay epsilon on schedule.

        Returns (did_train, [loss, mc_loss, mean_q, mean_return],
        buffer_mean_return, epsilon). did_train is False until the buffer has
        at least batch_size transitions.
        """
        self.update_target(sess)

        self.epoch += 1
        if not self._is_exploration_enough(self.batch_size):
            return False, [0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            loss, montecarlo_loss, q_eval, returns = self.train_prioritized(
                sess)
        else:

            loss, montecarlo_loss, q_eval, returns = self.train_normal(sess)

        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec,
                               self.epsilon_min)
            print("update epsilon:", self.epsilon)
        return True, [loss, montecarlo_loss, q_eval,
                      returns], self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """One gradient step on a PER batch; refreshes sampled priorities
        with the new absolute TD errors (+1e-6 so none become zero)."""
        loss, q_eval, returns, montecarlo_loss = 0, 0, 0, 0
        for idx in range(1):
            sample_indices = self.prioritized_replay_buffer.make_index(
                self.batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            _, loss, q_eval, montecarlo_loss, priority_values = sess.run(
                [
                    self._train_op, self.loss, self.q_eval_wrt_a,
                    self.montecarlo_loss, self.priority_values
                ],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                    self.important_sampling_weight_ph: weights
                })

            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(
                sample_indices, priorities)
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)

    def train_normal(self, sess):
        """One gradient step on a uniformly sampled batch."""
        loss, q_eval, returns, montecarlo_loss = 0, 0, 0, 0
        for idx in range(1):
            sample_index = self.replay_buffer.make_index(self.batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_index)
            _, loss, q_eval, montecarlo_loss = sess.run(
                [
                    self._train_op, self.loss, self.q_eval_wrt_a,
                    self.montecarlo_loss
                ],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_value: returns,
                })
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns)
示例#20
0
class DQNAgent:
    """Vanilla DQN agent for a gym environment with image observations.

    Trains a single online network (no separate target network is used in
    practice — see the note in ``train_model``), explores with an
    exponentially-annealed epsilon-greedy policy, and stores transitions
    in a uniform replay buffer.  Rewards are tracked separately for the
    positive ("white") and negative ("black") components.
    """

    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 train_step, begin_copy, copy_step, epsilon_delta,
                 epsilon_start, epsilon_end, load_model, path_to_load,
                 path_to_save, episode_steps, episode_to_save, max_buffer_len):

        # Epsilon

        # epsilon decays from epsilon_start toward epsilon_end at a rate
        # controlled by epsilon_delta (see reduce_epsilon).
        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch
        self.action_number = action_number
        self.gamma = gamma

        # Episode Params

        # begin_train: minimum buffer size before gradient updates start.
        # NOTE(review): begin_copy/copy_step/train_step are stored but the
        # code that used them is commented out in train().
        self.begin_train = begin_train
        self.begin_copy = begin_copy
        self.copy_step = copy_step
        self.train_step = train_step
        self.episodes = episodes
        self.episode_steps = episode_steps
        self.episode_to_save = episode_to_save

        # I/O params

        self.path_to_load = path_to_load
        self.path_to_save = path_to_save
        self.load_model = load_model

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # self.device = torch.device('cpu')
        # BoxModel consumes the 150x100 single-channel frames produced by
        # preprocess_observation.
        self.model = BoxModel((150, 100, 1), action_number).to(self.device)
        if self.load_model:
            self.model.load_state_dict(torch.load(self.path_to_load))

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []

    def reduce_epsilon(self, episode):
        """Exponentially anneal epsilon toward epsilon_end as episodes pass."""
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        """Pick a random action with probability epsilon, else the greedy one.

        P(random() >= 1 - eps) == eps, so the first branch is the explore case.
        Stores the choice in self.action and returns it.
        """
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(observation):
        """Crop to a 150x100 window, scale to [0, 1], convert to grayscale.

        Uses the standard ITU-R luminance weights; output is CHW (1, 150, 100).
        """
        rgb = observation[30:180, 30:130] / 255
        r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return gray.reshape(1, 150, 100)

    def transition_process(self, o_state, o_act, o_reward, o_next_state,
                           o_done):
        """Convert a sampled batch of transitions to tensors on self.device.

        NOTE(review): torch.autograd.Variable is deprecated (plain tensors
        behave identically on modern PyTorch).
        """
        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        """One TD(0) update on a minibatch sampled from the replay buffer.

        NOTE(review): q_next comes from the same online network — no frozen
        target network is used, despite begin_copy/copy_step being stored;
        confirm this is intended.
        """
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state)
        q_next = self.model(o_next_state)
        # (1 - done) zeroes the bootstrap term on terminal transitions.
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        # .data detaches the target so gradients only flow through q.
        loss = (q.gather(1, o_act.unsqueeze(1)).squeeze(1) -
                torch.autograd.Variable(y_hat.data)).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()

    def print(self, episode, reward_black, reward_white, epsilon):
        """Log per-episode rewards.  (Method name shadows builtin print
        inside the class namespace only — calls go through self.print.)"""
        print(f"For episode {episode} reward white - "
              f"{reward_white} and black - {reward_black},"
              f"epsilon - {epsilon}")

    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        """Main training loop: roll episodes, push transitions, fit, anneal.

        Checkpoints the model (and pops a blocking reward plot) every
        episode_to_save episodes.
        """
        start = time()
        print("Begin to Train")

        for episode in range(self.episodes):
            observation = env.reset()
            self.state = self.preprocess_observation(observation)
            reward_black, reward_white, total_reward = 0, 0, 0
            for episode_steps in range(self.episode_steps):
                action = self.epsilon_greedy()
                next_observation, reward, done, _ = env.step(action)
                # Split the signed reward into black (negative) and
                # white (positive) components for separate bookkeeping.
                reward_black += (reward < 0) * abs(reward)
                reward_white += (reward > 0) * reward
                total_reward += reward
                next_state = self.preprocess_observation(next_observation)
                self.replay_buffer.push(self.state, action, reward, next_state,
                                        done)
                # Only start learning once the buffer has enough samples.
                if len(self.replay_buffer) >= self.begin_train:
                    self.train_model()
                # if (episode_step >= self.begin_copy) and (episode_step % self.copy_step == 0):
                #     plt.plot(total_reward)
                #     plt.show()
                # self.const_model = self.model.clone()
                if done:
                    break

            self.reduce_epsilon(episode)
            if episode != 0 and episode % self.episode_to_save == 0:
                torch.save(self.model.state_dict(), self.path_to_save)
                # NOTE(review): plt.show() blocks until the window closes.
                plt.plot(self.rewards)
                plt.show()

            self.rewards_black.append(reward_black)
            self.rewards_white.append(reward_white)
            self.rewards.append(total_reward)
            self.print(episode,
                       reward_black=reward_black,
                       reward_white=reward_white,
                       epsilon=self.epsilon)
            print(time() - start)

    def play(self, env: gym.wrappers.time_limit.TimeLimit):
        """Run one fully-greedy, rendered episode and print the total reward."""
        observation = env.reset()
        reward_black, reward_white, total_reward = 0, 0, 0
        for episode_steps in range(self.episode_steps):
            state = self.preprocess_observation(observation)
            state = torch.autograd.Variable(
                torch.FloatTensor(state).to(self.device).unsqueeze(0))
            print(self.model(state))
            action = self.model(state).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            reward_black += (reward < 0) * abs(reward)
            reward_white += (reward > 0) * reward
            total_reward += reward
            sleep(0.01)
            env.render()
            if done:
                break
        print(total_reward)
# ---- Example #21 ----
class DQN2Net_interface(LearningAgent, PIDAgent):
    def __init__(
        self,
        user_num,
        n_actions,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train=1,
    ):
        """Two-headed (GMV / cost) double-DQN bidding agent with PID budget control.

        Args:
            user_num: number of distinct users (size of the id-embedding table).
            n_actions: size of the discrete bid grid.
            n_features: state dimensionality (column 0 is the user id).
            init_roi: initial ROI threshold for the PID controller.
            budget: campaign budget handed to the PID controller.
            use_budget_control: gate actions by the PID ROI threshold if True.
            use_prioritized_experience_replay: use a PER buffer for the
                combined reward instead of a uniform one.
            max_trajectory_length: scales the replay-buffer capacity.
            update_times_per_train: gradient steps per train() call.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=2)
        self.user_num = user_num
        self.use_budget_control = use_budget_control
        self.update_times_per_train = update_times_per_train
        self.n_actions = n_actions
        self.n_features = n_features
        # Undiscounted returns.
        self.gamma = 1.
        self.lr = 0.001

        self.user_based_adjust_times = 40

        # Epsilon-greedy exploration schedule (annealed inside train()).
        self.epsilon = 0.4
        self.epsilon_min = 0.05

        self.epsilon_dec = 0.1
        self.epsilon_dec_iter = 5000 // self.user_based_adjust_times
        self.epsilon_dec_iter_min = 500 // self.user_based_adjust_times

        # Target-network sync cadence; soft (Polyak) updates are the default.
        self.replace_target_iter = 1
        self.soft_update_iter = 1
        self.softupdate = True

        self.scope_name = "DQN-model"

        self.epoch = 0

        self.buffer_size = 1000 * max_trajectory_length

        self.batch_size = 512
        # PER exponents: alpha shapes priorities, beta the IS correction.
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)
        # GMV and cost reward components always use plain uniform buffers.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size,
                                               save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)

        self.margin_constant = 2

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_q_net(self, state, n_actions, variable_scope, reuse=False):
        """Build a 3-layer MLP Q-network over (user embedding, features).

        Column 0 of `state` is a user id that is swapped for a learned
        10-dim embedding before the dense stack.  The output head is
        rectified so Q-values are non-negative.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            # Per-user learned embedding table.
            embedding_table = tf.get_variable(
                name="user_id",
                shape=[self.user_num, 10],
                initializer=initializers.xavier_initializer(),
                trainable=True,
                dtype=tf.float32)
            ids = tf.cast(state[:, 0], dtype=tf.int32)
            embedded_ids = tf.nn.embedding_lookup(
                embedding_table, ids=ids, name="user_id_embedding")
            net = tf.concat([embedded_ids, state[:, 1:]], axis=1)

            width = net.get_shape()[1]

            # Three ReLU hidden layers: width, width//2, width//2
            # (TF variable names fc1/fc2/fc3 kept for checkpoint compat).
            for units, layer_name in ((width, 'fc1'), (width // 2, 'fc2'),
                                      (width // 2, 'fc3')):
                net = tf.layers.dense(
                    net,
                    units=units,
                    activation=tf.nn.relu,
                    name=layer_name,
                    kernel_initializer=initializers.xavier_initializer())

            # Linear head rectified at zero — identical to the original
            # tf.maximum(dense(...), 0).
            q_out = tf.nn.relu(
                tf.layers.dense(
                    net,
                    units=n_actions,
                    name='q',
                    kernel_initializer=initializers.xavier_initializer()))
            return q_out

    def _build_net(self):
        """Assemble the full TF graph: placeholders, the four Q-networks
        (eval/target x gmv/cost), target-sync ops, TD/double-DQN/Monte-Carlo
        targets, losses, optimizers, and the predicted long-term-ROI head.
        """

        # --- Placeholders for one batch of transitions -------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features],
                                 name='s_')
        self.r_gmv = tf.placeholder(tf.float32, [
            None,
        ], name='r_gmv')
        self.r_cost = tf.placeholder(tf.float32, [
            None,
        ], name='r_cost')
        # Scalar ROI threshold from the PID controller.
        self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
        self.r = tf.placeholder(tf.float32, [
            None,
        ], name='r')
        self.a = tf.placeholder(tf.int32, [
            None,
        ], name='a')
        self.done = tf.placeholder(tf.float32, [
            None,
        ], name='done')
        # Monte-Carlo returns (per reward component) for the MC losses.
        self.return_gmv_value = tf.placeholder(tf.float32, [
            None,
        ],
                                               name='return_gmv')
        self.return_cost_value = tf.placeholder(tf.float32, [
            None,
        ],
                                                name='return_cost')
        self.return_value = tf.placeholder(tf.float32, [
            None,
        ], name='return')
        # PER importance-sampling weights.
        self.important_sampling_weight_ph = tf.placeholder(
            tf.float32, [None], name="important_sampling_weight")

        # --- Four Q-networks; combined Q = Q_gmv - roi_thr * Q_cost ------
        self.q_eval_gmv = self._build_q_net(self.s,
                                            self.n_actions,
                                            variable_scope="eval_gmv_net")
        self.q_next_gmv = self._build_q_net(self.s_,
                                            self.n_actions,
                                            variable_scope="target_gmv_net")
        self.q_eval_cost = self._build_q_net(self.s,
                                             self.n_actions,
                                             variable_scope="eval_cost_net")
        self.q_next_cost = self._build_q_net(self.s_,
                                             self.n_actions,
                                             variable_scope="target_cost_net")
        self.q_eval = self.q_eval_gmv - self.roi_thr * self.q_eval_cost
        self.q_next = self.q_next_gmv - self.roi_thr * self.q_next_cost

        t_gmv_params = scope_vars(absolute_scope_name("target_gmv_net"))
        e_gmv_params = scope_vars(absolute_scope_name("eval_gmv_net"))
        t_cost_params = scope_vars(absolute_scope_name("target_cost_net"))
        e_cost_params = scope_vars(absolute_scope_name("eval_cost_net"))

        # --- Target-network sync ops (hard copy and Polyak) --------------
        with tf.variable_scope('hard_replacement'):
            self.target_gmv_replace_op = tf.group(
                [tf.assign(t, e) for t, e in zip(t_gmv_params, e_gmv_params)])
            self.target_cost_replace_op = tf.group([
                tf.assign(t, e) for t, e in zip(t_cost_params, e_cost_params)
            ])

        with tf.variable_scope('soft_update'):
            self.update_gmv_target_q = self.__make_update_exp__(
                e_gmv_params, t_gmv_params)
            self.update_cost_target_q = self.__make_update_exp__(
                e_cost_params, t_cost_params)

        with tf.variable_scope('q_target'):
            # TD(0) targets: bootstrap action chosen by the TARGET net.
            greedy_action_s_ = tf.argmax(self.q_next,
                                         axis=-1,
                                         name="td0_argmax_action",
                                         output_type=tf.int32)
            greedy_a_indices = tf.stack([
                tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32),
                         dtype=tf.int32), greedy_action_s_
            ],
                                        axis=1)
            target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv,
                                           indices=greedy_a_indices)
            target_q_cost_sa = tf.gather_nd(params=self.q_next_cost,
                                            indices=greedy_a_indices)
            target_q_sa = tf.gather_nd(params=self.q_next,
                                       indices=greedy_a_indices)
            self.td0_q_gmv_target = tf.stop_gradient(self.r_gmv + self.gamma *
                                                     (1. - self.done) *
                                                     target_q_gmv_sa)
            self.td0_q_cost_target = tf.stop_gradient(self.r_cost +
                                                      self.gamma *
                                                      (1. - self.done) *
                                                      target_q_cost_sa)
            self.td0_q_target = tf.stop_gradient(self.r + self.gamma *
                                                 (1. - self.done) *
                                                 target_q_sa)

            # Double-DQN targets: action chosen by the EVAL net,
            # value read from the TARGET net.
            # NOTE(review): selection uses q_eval(s), not q_eval(s_) —
            # confirm this deviation from standard double DQN is intended.
            target_action = tf.argmax(self.q_eval,
                                      axis=-1,
                                      name="doubeldqn_argmax_action",
                                      output_type=tf.int32)
            target_a_indices = tf.stack([
                tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32),
                         dtype=tf.int32), target_action
            ],
                                        axis=1)
            ddqn_target_q_gmv_sa = tf.gather_nd(params=self.q_next_gmv,
                                                indices=target_a_indices)
            ddqn_target_q_cost_sa = tf.gather_nd(params=self.q_next_cost,
                                                 indices=target_a_indices)
            ddqn_target_q_sa = tf.gather_nd(params=self.q_next,
                                            indices=target_a_indices)
            self.double_dqn_gmv_target = tf.stop_gradient(self.r_gmv +
                                                          self.gamma *
                                                          (1. - self.done) *
                                                          ddqn_target_q_gmv_sa)
            self.double_dqn_cost_target = tf.stop_gradient(
                self.r_cost + self.gamma *
                (1. - self.done) * ddqn_target_q_cost_sa)
            self.double_dqn_target = tf.stop_gradient(self.r + self.gamma *
                                                      (1. - self.done) *
                                                      ddqn_target_q_sa)

            # Monte-Carlo targets are the empirical returns.
            self.montecarlo_gmv_target = self.return_gmv_value
            self.montecarlo_cost_target = self.return_cost_value
            self.montecarlo_target = self.return_value

        with tf.variable_scope('q_eval'):
            # Q(s, a) for the actions actually taken in the batch.
            a_indices = tf.stack([
                tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32),
                         dtype=tf.int32), self.a
            ],
                                 axis=1)
            self.q_eval_gmv_wrt_a = tf.gather_nd(params=self.q_eval_gmv,
                                                 indices=a_indices)
            self.q_eval_cost_wrt_a = tf.gather_nd(params=self.q_eval_cost,
                                                  indices=a_indices)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval,
                                             indices=a_indices)

        with tf.variable_scope('loss'):
            self._build_loss()

            self._pick_loss()

        with tf.variable_scope('train'):
            # Separate Adam optimizers: combined loss over both eval nets,
            # plus per-component ops restricted to their own variables.
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss, var_list=e_gmv_params + e_cost_params)
            self._train_gmv_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.gmv_loss, var_list=e_gmv_params)
            self._train_cost_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cost_loss, var_list=e_cost_params)

        with tf.variable_scope('roi'):
            # Predicted long-term ROI = Q_gmv(s,a) / Q_cost(s,a); the 1e-6
            # guards against division by zero.
            greedy_action_indices = tf.stack([
                tf.range(tf.cast(tf.shape(self.a)[0], dtype=tf.int32),
                         dtype=tf.int32), self.a
            ],
                                             axis=1)
            self.plongterm_roi = tf.gather_nd(
                params=self.q_eval_gmv, indices=greedy_action_indices) / (
                    tf.gather_nd(params=self.q_eval_cost,
                                 indices=greedy_action_indices) + 1e-6)

    def _pick_loss(self):
        """Select which prebuilt losses the train ops optimize.

        Double-DQN losses drive the GMV head, the cost head, and the
        combined objective; the summed absolute double-DQN TD errors feed
        replay prioritization.
        """
        # Target networks exist, so train() must keep them synced.
        self.has_target_net = True
        self.gmv_loss = self.gmv_double_dqn_loss
        self.cost_loss = self.cost_double_dqn_loss
        self.loss = self.double_dqn_loss
        # NOTE(review): "doubel" is a typo inherited from the attribute
        # names defined in _build_loss; renaming requires changing both
        # methods together.
        self.priority_values = self.gmv_doubel_dqn_error + self.cost_doubel_dqn_error + self.doubel_dqn_error

    def _build_loss(self):
        """Define TD(0), double-DQN, and Monte-Carlo MSE losses for each head.

        When prioritized replay is on, every squared error is weighted by
        the importance-sampling weights.  Absolute per-sample TD errors are
        also exposed for use as replay priorities.
        """

        if self.use_prioritized_experience_replay:

            # --- IS-weighted TD(0) losses -------------------------------
            self.gmv_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.td0_q_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='TD0_gmv_loss'))
            self.cost_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.td0_q_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='TD0_cost_loss'))
            self.dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph * tf.squared_difference(
                    self.td0_q_target, self.q_eval_wrt_a, name='TD0_loss'))

            # --- IS-weighted double-DQN losses --------------------------
            self.gmv_double_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.double_dqn_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='Double_DQN_gmv_loss'))
            self.cost_double_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.double_dqn_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='Double_DQN_cost_loss'))
            self.double_dqn_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.double_dqn_target,
                                      self.q_eval_wrt_a,
                                      name='Double_DQN_error'))

            # --- IS-weighted Monte-Carlo losses -------------------------
            # NOTE(review): op names differ from the non-PER branch
            # ('GMV_error' vs 'MonteCarlo_gmv_loss') — cosmetic only.
            self.gmv_montecarlo_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.montecarlo_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='GMV_error'))
            self.cost_montecarlo_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.montecarlo_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='COST_error'))
            self.montecarlo_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.montecarlo_target,
                                      self.q_eval_wrt_a,
                                      name='MonteCarlo_error'))

        else:

            # --- Unweighted TD(0) losses --------------------------------
            self.gmv_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_q_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='TD0_gmv_loss'))
            self.cost_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_q_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='TD0_cost_loss'))
            self.dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_q_target,
                                      self.q_eval_wrt_a,
                                      name='TD0_loss'))

            # --- Unweighted double-DQN losses ---------------------------
            self.gmv_double_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.double_dqn_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='Double_DQN_gmv_loss'))
            self.cost_double_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.double_dqn_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='Double_DQN_cost_loss'))
            self.double_dqn_loss = tf.reduce_mean(
                tf.squared_difference(self.double_dqn_target,
                                      self.q_eval_wrt_a,
                                      name='Double_DQN_error'))

            # --- Unweighted Monte-Carlo losses --------------------------
            self.gmv_montecarlo_loss = tf.reduce_mean(
                tf.squared_difference(self.montecarlo_gmv_target,
                                      self.q_eval_gmv_wrt_a,
                                      name='MonteCarlo_gmv_loss'))
            self.cost_montecarlo_loss = tf.reduce_mean(
                tf.squared_difference(self.montecarlo_cost_target,
                                      self.q_eval_cost_wrt_a,
                                      name='MonteCarlo_cost_loss'))
            self.montecarlo_loss = tf.reduce_mean(
                tf.squared_difference(self.montecarlo_target,
                                      self.q_eval_wrt_a,
                                      name='MonteCarlo_error'))

        # Per-sample absolute TD errors (used for replay priorities).
        self.gmv_td0_error = tf.abs(self.td0_q_gmv_target -
                                    self.q_eval_gmv_wrt_a)
        self.cost_td0_error = tf.abs(self.td0_q_cost_target -
                                     self.q_eval_cost_wrt_a)
        self.td0_error = tf.abs(self.td0_q_target - self.q_eval_wrt_a)

        # NOTE(review): "doubel" typo is load-bearing — _pick_loss reads
        # these exact attribute names.
        self.gmv_doubel_dqn_error = tf.abs(self.double_dqn_gmv_target -
                                           self.q_eval_gmv_wrt_a)
        self.cost_doubel_dqn_error = tf.abs(self.double_dqn_cost_target -
                                            self.q_eval_cost_wrt_a)
        self.doubel_dqn_error = tf.abs(self.double_dqn_target -
                                       self.q_eval_wrt_a)

        self.gmv_montecarlo_error = tf.abs(self.montecarlo_gmv_target -
                                           self.q_eval_gmv_wrt_a)
        self.cost_montecarlo_error = tf.abs(self.montecarlo_cost_target -
                                            self.q_eval_cost_wrt_a)
        self.montecarlo_error = tf.abs(self.montecarlo_target -
                                       self.q_eval_wrt_a)

    def __make_update_exp__(self, vals, target_vals):
        """Return an op Polyak-averaging `vals` into `target_vals`.

        target <- polyak * target + (1 - polyak) * source, pairing
        variables by sorted name.
        """
        polyak = 1.0 - 1e-2
        pairs = zip(sorted(vals, key=lambda v: v.name),
                    sorted(target_vals, key=lambda v: v.name))
        updates = [
            tgt.assign(polyak * tgt + (1.0 - polyak) * src)
            for src, tgt in pairs
        ]
        return tf.group(*updates)

    def __make_hardreplace_exp__(self, vals, target_vals):
        """Return an op that copies each source variable into its target,
        pairing variables by sorted name."""
        pairs = zip(sorted(vals, key=lambda v: v.name),
                    sorted(target_vals, key=lambda v: v.name))
        return tf.group(*[tgt.assign(src) for src, tgt in pairs])

    def build_model_saver(self, var_scope):
        """Create a Saver over all global variables under `var_scope`,
        keeping only the latest checkpoint."""
        scoped_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=var_scope)
        self.model_saver = tf.train.Saver(var_list=scoped_vars, max_to_keep=1)

    def save(self, sess, path, step):
        """Checkpoint the model to `path`, tagged with global step `step`.

        Creates the parent directory if needed.  Uses exist_ok=True to
        avoid the check-then-create race of the original
        exists()/makedirs() pair (two processes saving concurrently could
        both pass the exists() check and one would crash in makedirs()).
        """
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Load model weights from `path` into the session and log it."""
        self.model_saver.restore(sess, save_path=path)
        message = '%s model reloaded from %s' % (self.scope_name, path)
        print(message)

    def experience(self, new_trajectory, other_info=None):
        """Store a finished trajectory in the replay buffers.

        `other_info` must carry the per-component "gmv" and "cost" views
        of the same trajectory; those always go into the plain uniform
        buffers, while the combined trajectory goes into whichever main
        buffer (PER or uniform) is active.
        """
        gmv_trajectory = other_info["gmv"]
        cost_trajectory = other_info["cost"]
        main_buffer = (self.prioritized_replay_buffer
                       if self.use_prioritized_experience_replay else
                       self.replay_buffer)
        add_episode(main_buffer, new_trajectory, gamma=self.gamma)

        add_episode(self.gmv_replay_buffer,
                    gmv_trajectory,
                    gamma=self.gamma)
        add_episode(self.cost_replay_buffer,
                    cost_trajectory,
                    gamma=self.gamma)

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Map a learned discrete action index to a continuous bid.

        Greedy at test time, epsilon-greedy otherwise.  Returns
        (bid, {"learning_action": index}).
        """
        if is_test:
            action_index = self.greedy_action(sess, obs, other_info)
        else:
            action_index = self.choose_action(sess, obs, other_info)
        low, high = MultiUserEnv.bid_min, MultiUserEnv.bid_max
        # Spread the n_actions indices evenly over [bid_min, bid_max].
        bid = low + (high - low) / (self.n_actions - 1) * action_index
        return bid, {"learning_action": action_index}

    def __greedy__(self, sess, observation, roi_thr):
        """Return argmax_a Q(s, a) for one observation at the given ROI
        threshold."""
        batched_obs = observation[np.newaxis, :]
        q_values = sess.run(self.q_eval,
                            feed_dict={
                                self.s: batched_obs,
                                self.roi_thr: roi_thr
                            })
        return np.argmax(q_values, axis=1)[0]

    def __epsilon_greedy__(self, sess, observation, roi_thr):
        """Explore uniformly with probability epsilon, else act greedily."""
        should_explore = np.random.uniform() < self.epsilon
        if should_explore:
            return np.random.randint(0, self.n_actions)
        return self.__greedy__(sess, observation, roi_thr)

    def choose_action(self, sess, observation, other_info):
        """Epsilon-greedy action under the current ROI threshold."""
        roi_thr = (self.get_roi_threshold()
                   if self.use_budget_control else self.init_roi)
        return self.__epsilon_greedy__(sess, observation, roi_thr)

    def greedy_action(self, sess, observation, other_info):
        """Greedy action, optionally gated by per-user budget control.

        With budget control on, a user is admitted on their first request
        only if the predicted long-term ROI of the greedy action clears
        the current threshold; later requests reuse that admission
        decision.  Rejected requests fall back to action 0.
        """
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi

        best_action = self.__greedy__(sess, observation, roi_thr)
        if not self.use_budget_control:
            return best_action

        user_idx = other_info["user_idx"]
        request_idx = other_info["request_idx"]
        roi_threshold = self.get_roi_threshold()

        if request_idx != 0:
            # Admission was already decided on this user's first request.
            return best_action if self.is_user_selected(user_idx) else 0

        # First request for this user: estimate long-term ROI of the
        # greedy action and admit the user only if it clears the bar.
        batched_obs = np.expand_dims(observation, axis=0)
        max_plongterm_roi = sess.run(self.plongterm_roi,
                                     feed_dict={
                                         self.s: batched_obs,
                                         self.a: [best_action],
                                     })
        if max_plongterm_roi >= roi_threshold:
            self.explore_user(user_idx)
            return best_action
        return 0

    def get_memory_returns(self):
        """Mean return currently held by the active replay buffer."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay else
                         self.replay_buffer)
        return active_buffer.current_mean_return

    def _is_exploration_enough(self, min_pool_size):
        """True once the active buffer holds at least min_pool_size samples."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay else
                         self.replay_buffer)
        return len(active_buffer) >= min_pool_size

    def update_target(self, sess):
        """Sync target networks: softly (Polyak) or by hard copy, on their
        respective cadences."""
        if self.softupdate:
            # Polyak-average eval weights into the target networks.
            if self.epoch % self.soft_update_iter == 0:
                sess.run(self.update_gmv_target_q)
                sess.run(self.update_cost_target_q)
        elif self.epoch % self.replace_target_iter == 0:
            # Hard-copy eval weights into the target networks.
            sess.run(self.target_gmv_replace_op)
            sess.run(self.target_cost_replace_op)

    def train(self, sess):
        """One training epoch: sync targets, fit on one batch, anneal epsilon.

        Returns (trained, stats, memory_mean_return, epsilon) where `stats`
        is the 14-element list consumed by external logging.
        """
        if self.has_target_net:
            self.update_target(sess)

        self.epoch += 1

        # Not enough experience collected yet: report an all-zero stats row.
        if not self._is_exploration_enough(self.batch_size):
            return False, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0

        if self.use_prioritized_experience_replay:

            loss, montecarlo_loss, q_eval, returns, \
            gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \
            cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_prioritized(sess)
        else:

            loss, montecarlo_loss, q_eval, returns, \
            gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns, \
            cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = self.train_normal(sess)

        # Anneal epsilon on a schedule that itself accelerates over time.
        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec,
                               self.epsilon_min)
            # BUG FIX: the original `self.epsilon_dec_iter //= 1.5`
            # floor-divides by a float, silently turning the counter into
            # a float that then feeds the `%` test above.  Keep it an int.
            self.epsilon_dec_iter = max(int(self.epsilon_dec_iter // 1.5),
                                        self.epsilon_dec_iter_min)
            print("update epsilon:", self.epsilon)
        return True, [
            0, 0, loss, montecarlo_loss, q_eval, returns, gmv_loss,
            gmv_montecarlo_loss, gmv_q_eval, gmv_returns, cost_loss,
            cost_montecarlo_loss, cost_q_eval, cost_returns
        ], self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """One prioritized-replay training pass over the joint/GMV/cost heads.

        Samples one index set from the prioritized buffer and reuses the same
        indices on the (uniform) GMV and cost buffers so all three views
        describe the same transitions, then refreshes sample priorities from
        the new error estimates.

        Returns a 12-tuple: (loss, MC loss, mean Q, mean return) for the
        joint, GMV and cost critics.
        """
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0
        cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi
        for idx in range(self.update_times_per_train):
            sample_indices = self.prioritized_replay_buffer.make_index(
                self.batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            # BUGFIX: gmv_replay_buffer is a plain ReplayBuffer whose
            # sample_index yields 7 values (no weights/ranges, cf. the cost
            # buffer below and train_normal); the previous 9-value unpack
            # raised ValueError and clobbered `weights`.
            obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index(
                sample_indices)
            obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index(
                sample_indices)
            _, loss, montecarlo_loss, q_eval, \
            _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \
            _2, cost_loss, cost_montecarlo_loss, cost_q_eval, \
            priority_values = sess.run(
                [self._train_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a,
                 self._train_gmv_op, self.gmv_loss, self.gmv_montecarlo_loss, self.q_eval_gmv_wrt_a,
                 self._train_cost_op, self.cost_loss, self.cost_montecarlo_loss, self.q_eval_cost_wrt_a,
                 self.priority_values],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r_gmv: rew_gmv,
                    self.r_cost: rew_cost,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.return_gmv_value: gmv_returns,
                    self.return_cost_value: cost_returns,
                    self.return_value: returns,
                    self.important_sampling_weight_ph: weights,
                    self.roi_thr: roi_thr
                })

            # Small epsilon keeps every transition sampleable.
            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(
                sample_indices, priorities)
        return loss, montecarlo_loss, np.average(q_eval), np.average(returns), \
               gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \
               cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)

    def train_normal(self, sess):
        """One uniform-replay training pass over the joint/GMV/cost heads.

        The same indices are replayed against all three buffers so the GMV
        and cost samples describe the same transitions as the joint sample.

        Returns a 12-tuple: (loss, MC loss, mean Q, mean return) for the
        joint, GMV and cost critics.
        """
        loss = montecarlo_loss = q_eval = returns = 0
        gmv_loss = gmv_montecarlo_loss = gmv_q_eval = gmv_returns = 0
        cost_loss = cost_montecarlo_loss = cost_q_eval = cost_returns = 0
        roi_thr = (self.get_roi_threshold() if self.use_budget_control
                   else self.init_roi)
        for _ in range(self.update_times_per_train):
            indices = self.replay_buffer.make_index(self.batch_size)

            obs, act, rew, obs_next, done, dis_2_end, returns = \
                self.replay_buffer.sample_index(indices)
            obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = \
                self.gmv_replay_buffer.sample_index(indices)
            obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = \
                self.cost_replay_buffer.sample_index(indices)

            fetches = [
                self._train_op, self.loss, self.montecarlo_loss, self.q_eval_wrt_a,
                self._train_gmv_op, self.gmv_loss, self.gmv_montecarlo_loss, self.q_eval_gmv_wrt_a,
                self._train_cost_op, self.cost_loss, self.cost_montecarlo_loss, self.q_eval_cost_wrt_a,
            ]
            feeds = {
                self.s: obs,
                self.a: act,
                self.r_gmv: rew_gmv,
                self.r_cost: rew_cost,
                self.r: rew,
                self.s_: obs_next,
                self.done: done,
                self.return_gmv_value: gmv_returns,
                self.return_cost_value: cost_returns,
                self.return_value: returns,
                self.roi_thr: roi_thr,
            }
            (_, loss, montecarlo_loss, q_eval,
             _, gmv_loss, gmv_montecarlo_loss, gmv_q_eval,
             _, cost_loss, cost_montecarlo_loss, cost_q_eval) = sess.run(
                fetches, feed_dict=feeds)
        return (loss, montecarlo_loss, np.average(q_eval), np.average(returns),
                gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns),
                cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns))
# --- example 22 ---
class DDPG_interface(LearningAgent, PIDAgent):
    """DDPG bidding agent with separate GMV and cost critics.

    The combined critic optimizes ``gmv - roi_thr * cost``; when
    ``use_budget_control`` is on, the PID machinery inherited from
    ``PIDAgent`` adapts the ROI threshold.
    """

    def __init__(
        self,
        user_num,
        action_dim,
        action_bound,
        n_features,
        init_roi,
        budget,
        use_budget_control,
        use_prioritized_experience_replay,
        max_trajectory_length,
        update_times_per_train,
    ):
        """Set hyper-parameters, build the replay buffers and the TF graph.

        Args:
            user_num: number of distinct users (rows of the id embedding).
            action_dim: dimensionality of the actor output.
            action_bound: scale applied to the clipped action in get_action.
            n_features: state feature count (column 0 is the user id).
            init_roi: ROI threshold used when budget control is off.
            budget: budget handed to the PID controller.
            use_budget_control: enable PID-based ROI thresholding.
            use_prioritized_experience_replay: main replay-buffer type switch.
            max_trajectory_length: scales the replay-buffer capacity.
            update_times_per_train: gradient steps per train() call.
        """
        PIDAgent.__init__(self,
                          init_roi=init_roi,
                          default_alpha=1,
                          budget=budget,
                          integration=2)
        self.use_budget_control = use_budget_control
        self.user_num = user_num
        self.action_bound = action_bound
        self.action_dim = action_dim
        self.n_actions = 1
        self.n_features = n_features
        self.gamma = 1.
        self.update_times_per_train = update_times_per_train

        self.lr = 0.001

        # Epsilon schedule for the noisy-exploration branch of action choice.
        self.epsilon = 0.9
        self.epsilon_min = 0.1
        self.epsilon_dec = 0.3
        self.epsilon_dec_iter = 100

        # Target-network sync settings (soft Polyak updates by default).
        self.replace_target_iter = 300
        self.soft_update_iter = 1
        self.softupdate = True
        self.scope_name = "DDPG-model"

        self.epoch = 0

        # Ornstein-Uhlenbeck noise added to the actor output during training.
        self.exploration_noise = OUNoise(self.action_dim)
        self.noise_weight = 1
        self.noise_descrement_per_sampling = 0.0001

        self.buffer_size = 20000 * max_trajectory_length
        self.batch_size = 512

        # Prioritized-replay exponents (alpha: prioritization strength,
        # beta: importance-sampling correction).
        self.alpha = 0.6
        self.beta = 0.4
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        if self.use_prioritized_experience_replay:
            self.prioritized_replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.alpha, max_priority=20.)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)
        # Separate uniform buffers for the GMV and cost reward streams.
        self.cost_replay_buffer = ReplayBuffer(self.buffer_size,
                                               save_return=True)
        self.gmv_replay_buffer = ReplayBuffer(self.buffer_size,
                                              save_return=True)

        with tf.variable_scope(self.scope_name):

            self._build_net()

            self.build_model_saver(self.scope_name)

    def _build_net(self):
        """Construct the DDPG graph: placeholders, actor/critic networks,
        target-sync ops, TD targets, losses and optimizers.
        """
        # --- placeholders -------------------------------------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features],
                                 name='s_')
        self.r_gmv = tf.placeholder(tf.float32, [
            None,
        ], name='r_gmv')
        self.r_cost = tf.placeholder(tf.float32, [
            None,
        ], name='r_cost')
        self.r = tf.placeholder(tf.float32, [
            None,
        ], name='r')
        self.roi_thr = tf.placeholder(tf.float32, [], name="roi_thr")
        self.a = tf.placeholder(tf.float32, [
            None,
        ], name='a')
        self.done = tf.placeholder(tf.float32, [
            None,
        ], name='done')
        self.gmv_return_value = tf.placeholder(tf.float32, [
            None,
        ],
                                               name='gmv_return')
        self.cost_return_value = tf.placeholder(tf.float32, [
            None,
        ],
                                                name='cost_return')
        self.return_value = tf.placeholder(tf.float32, [
            None,
        ], name='return')
        self.important_sampling_weight_ph = tf.placeholder(
            tf.float32, [None], name="important_sampling_weight")

        # --- actor and critics (eval + target copies) ---------------------
        self.a_eval = self._build_action_net(self.s,
                                             variable_scope="actor_eval_net")
        self.a_target = self._build_action_net(
            self.s_, variable_scope="actor_target_net")
        self.gmv_critic_eval = self._build_q_net(
            self.s, self.a, variable_scope="gmv_critic_eval_net")
        # Same critic weights (reuse=True) evaluated at the actor's own
        # action; used by the actor loss so only the actor receives gradients.
        self.gmv_critic_eval_for_loss = self._build_q_net(
            self.s,
            self.a_eval,
            variable_scope="gmv_critic_eval_net",
            reuse=True)
        self.gmv_critic_target = self._build_q_net(
            self.s_, self.a_target, variable_scope="gmv_critic_target_net")

        self.cost_critic_eval = self._build_q_net(
            self.s, self.a, variable_scope="cost_critic_eval_net")
        self.cost_critic_eval_for_loss = self._build_q_net(
            self.s,
            self.a_eval,
            variable_scope="cost_critic_eval_net",
            reuse=True)
        self.cost_critic_target = self._build_q_net(
            self.s_, self.a_target, variable_scope="cost_critic_target_net")

        # Combined objective: GMV minus ROI-threshold-weighted cost.
        self.critic_eval = self.gmv_critic_eval - self.roi_thr * self.cost_critic_eval
        self.critic_eval_for_loss = self.gmv_critic_eval_for_loss - self.roi_thr * self.cost_critic_eval_for_loss
        self.critic_target = self.gmv_critic_target - self.roi_thr * self.cost_critic_target

        ae_params = scope_vars(absolute_scope_name("actor_eval_net"))
        at_params = scope_vars(absolute_scope_name("actor_target_net"))
        gmv_ce_params = scope_vars(absolute_scope_name("gmv_critic_eval_net"))
        gmv_ct_params = scope_vars(
            absolute_scope_name("gmv_critic_target_net"))
        cost_ce_params = scope_vars(
            absolute_scope_name("cost_critic_eval_net"))
        cost_ct_params = scope_vars(
            absolute_scope_name("cost_critic_target_net"))
        print(ae_params)
        print(at_params)
        print(gmv_ce_params)
        print(gmv_ct_params)
        print(cost_ce_params)
        print(cost_ct_params)

        # --- target-network synchronization ops ---------------------------
        with tf.variable_scope('hard_replacement'):
            self.a_target_replace_op = tf.group(
                [tf.assign(t, e) for t, e in zip(at_params, ae_params)])
            self.gmv_c_target_replace_op = tf.group([
                tf.assign(t, e) for t, e in zip(gmv_ct_params, gmv_ce_params)
            ])
            self.cost_c_target_replace_op = tf.group([
                tf.assign(t, e) for t, e in zip(cost_ct_params, cost_ce_params)
            ])

        with tf.variable_scope('soft_update'):
            self.a_update_target_q = self.__make_update_exp__(
                ae_params, at_params)
            self.gmv_c_update_target_q = self.__make_update_exp__(
                gmv_ce_params, gmv_ct_params)
            self.cost_c_update_target_q = self.__make_update_exp__(
                cost_ce_params, cost_ct_params)

        # --- TD(0) and Monte-Carlo regression targets ---------------------
        with tf.variable_scope('q_target'):
            self.td0_gmv_q_target = tf.stop_gradient(self.r_gmv + self.gamma *
                                                     (1. - self.done) *
                                                     self.gmv_critic_target)
            self.td0_cost_q_target = tf.stop_gradient(self.r_cost +
                                                      self.gamma *
                                                      (1. - self.done) *
                                                      self.cost_critic_target)
            self.td0_q_target = tf.stop_gradient(self.r + self.gamma *
                                                 (1. - self.done) *
                                                 self.critic_target)

            self.montecarlo_gmv_target = self.gmv_return_value
            self.montecarlo_cost_target = self.cost_return_value
            self.montecarlo_target = self.return_value

        with tf.variable_scope('loss'):
            self._build_loss()

            self._pick_loss()

        # --- optimizers ---------------------------------------------------
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss, var_list=gmv_ce_params + cost_ce_params)
            self._train_gmv_c_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.gmv_loss, var_list=gmv_ce_params)
            self._train_cost_c_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.cost_loss, var_list=cost_ce_params)
            self._train_a_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.actor_loss, var_list=ae_params)

        # Estimated long-run ROI = Q_gmv / Q_cost (1e-4 avoids divide-by-0);
        # consumed by greedy_action for budget gating.
        with tf.variable_scope('roi'):
            self.max_longterm_roi = self.gmv_critic_eval / (
                self.cost_critic_eval + 1e-4)

    def _pick_loss(self):
        """Select which of the tensors built in ``_build_loss`` are optimized."""
        self.has_target_net = True
        # TD(0) losses drive the critics; the actor maximizes the combined critic.
        self.loss = self.td_loss
        self.gmv_loss = self.gmv_td_loss
        self.cost_loss = self.cost_td_loss
        self.actor_loss = self.a_loss
        # Replay priorities come from the Monte-Carlo errors of both critics.
        self.priority_values = self.montecarlo_gmv_error + self.montecarlo_cost_error

    def _build_loss(self):
        """Build TD(0) and Monte-Carlo losses plus per-sample error tensors.

        With prioritized replay, the squared TD errors are weighted by the
        importance-sampling weights to correct the sampling bias.
        """
        if self.use_prioritized_experience_replay:

            self.gmv_td_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.td0_gmv_q_target,
                                      self.gmv_critic_eval,
                                      name='TD0_gmv_loss'))
            self.cost_td_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.td0_cost_q_target,
                                      self.cost_critic_eval,
                                      name='TD0_cost_loss'))
            # BUGFIX: td_loss was only defined in the uniform-replay branch,
            # so _pick_loss raised AttributeError under prioritized replay.
            self.td_loss = tf.reduce_mean(
                self.important_sampling_weight_ph *
                tf.squared_difference(self.td0_q_target,
                                      self.critic_eval,
                                      name='TD0_loss'))
        else:

            self.gmv_td_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_gmv_q_target,
                                      self.gmv_critic_eval,
                                      name='TD0_gmv_loss'))
            self.cost_td_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_cost_q_target,
                                      self.cost_critic_eval,
                                      name='TD0_cost_loss'))
            self.td_loss = tf.reduce_mean(
                tf.squared_difference(self.td0_q_target,
                                      self.critic_eval,
                                      name='TD0_loss'))

        # Deterministic-policy-gradient actor objective: maximize the combined
        # critic at the actor's action.
        self.a_loss = -tf.reduce_mean(self.critic_eval_for_loss)

        self.gmv_montecarlo_loss = tf.reduce_mean(
            tf.squared_difference(self.montecarlo_gmv_target,
                                  self.gmv_critic_eval,
                                  name='MonteCarlo_gmv_error'))
        self.cost_montecarlo_loss = tf.reduce_mean(
            tf.squared_difference(self.montecarlo_cost_target,
                                  self.cost_critic_eval,
                                  name='MonteCarlo_cost_error'))
        self.montecarlo_loss = tf.reduce_mean(
            tf.squared_difference(self.montecarlo_target,
                                  self.critic_eval,
                                  name='MonteCarlo_error'))

        # Per-sample absolute errors (used e.g. as replay priorities).
        self.td0_gmv_error = tf.abs(self.td0_gmv_q_target -
                                    self.gmv_critic_eval)
        self.td0_cost_error = tf.abs(self.td0_cost_q_target -
                                     self.cost_critic_eval)
        self.td0_error = tf.abs(self.td0_q_target - self.critic_eval)

        self.montecarlo_gmv_error = tf.abs(self.montecarlo_gmv_target -
                                           self.gmv_critic_eval)
        self.montecarlo_cost_error = tf.abs(self.montecarlo_cost_target -
                                            self.cost_critic_eval)
        self.montecarlo_error = tf.abs(self.montecarlo_target -
                                       self.critic_eval)

    def _build_q_net(self, state, action, variable_scope, reuse=False):
        """Critic Q(s, a): embeds the user id, appends the action, and
        regresses a scalar through two ReLU layers.

        Args:
            state: [batch, n_features] tensor; column 0 holds an integer
                user id (cast below), the rest are dense features.
            action: [batch] scalar action per sample.
            variable_scope: scope owning the critic's variables.
            reuse: share weights with a previously built copy of this scope.
        """
        with tf.variable_scope(variable_scope, reuse=reuse):
            user_id_embedding_table = tf.get_variable(
                name="user_id",
                shape=[self.user_num, 20],
                initializer=initializers.xavier_initializer(),
                trainable=True,
                dtype=tf.float32)
            # Replace the raw id column with its learned 20-d embedding.
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(
                user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]

            # Append the action as one extra input column.
            state = tf.concat(
                [state,
                 tf.expand_dims(action, axis=1, name="2d-action")],
                axis=1)
            fc1 = tf.layers.dense(state,
                                  units=n_features,
                                  activation=tf.nn.relu,
                                  name='fc1')
            fc2 = tf.layers.dense(fc1,
                                  units=n_features // 2,
                                  activation=tf.nn.relu,
                                  name='fc2')

            # Head emits action_dim units; only the first is used as Q(s, a).
            q = tf.layers.dense(fc2, units=self.action_dim, name='q')

            return q[:, 0]

    def _build_action_net(self, state, variable_scope):
        """Actor pi(s): embeds the user id and maps the state to an action
        in [0, 1] via a sigmoid head.

        The output is NOT rescaled here; ``get_action`` multiplies the
        clipped action by ``action_bound``.
        """
        with tf.variable_scope(variable_scope):
            user_id_embedding_table = tf.get_variable(
                name="user_id",
                shape=[self.user_num, 20],
                initializer=initializers.xavier_initializer(),
                trainable=True,
                dtype=tf.float32)
            # Replace the raw id column with its learned 20-d embedding.
            user_id = tf.cast(state[:, 0], dtype=tf.int32)
            user_id_embeddings = tf.nn.embedding_lookup(
                user_id_embedding_table, ids=user_id, name="user_id_embedding")
            state = tf.concat([user_id_embeddings, state[:, 1:]], axis=1)

            n_features = state.get_shape()[1]
            fc1 = tf.layers.dense(state,
                                  units=n_features // 2,
                                  activation=tf.nn.relu,
                                  name='fc1')
            actions = tf.layers.dense(fc1,
                                      self.action_dim,
                                      activation=tf.nn.sigmoid,
                                      name='a')
            # Multiplying by 1 only names the tensor; it does not rescale it.
            scaled_a = tf.multiply(actions, 1, name='scaled_a')

            return scaled_a[:, 0]

    def __make_update_exp__(self, vals, target_vals):
        """Return one grouped op that Polyak-averages eval variables into
        their targets: target <- polyak * target + (1 - polyak) * eval.

        Variables are paired by sorted name so the zip order is stable.
        """
        polyak = 1.0 - 1e-2
        paired = zip(sorted(vals, key=lambda v: v.name),
                     sorted(target_vals, key=lambda v: v.name))
        updates = [
            tgt.assign(polyak * tgt + (1.0 - polyak) * src)
            for src, tgt in paired
        ]
        return tf.group(*updates)

    def __make_hardreplace_exp__(self, vals, target_vals):
        """Return one grouped op that copies eval variables into their
        targets verbatim (hard replacement), paired by sorted name.
        """
        paired = zip(sorted(vals, key=lambda v: v.name),
                     sorted(target_vals, key=lambda v: v.name))
        copies = [tgt.assign(src) for src, tgt in paired]
        return tf.group(*copies)

    def build_model_saver(self, var_scope):
        """Create a Saver over every global variable under ``var_scope``."""
        scoped_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=var_scope)
        # Keep at most the three most recent checkpoints.
        self.model_saver = tf.train.Saver(var_list=scoped_vars, max_to_keep=3)

    def save(self, sess, path, step):
        """Checkpoint the model to ``path`` (suffixed with ``step``)."""
        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() guard when several runs share the directory.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.model_saver.save(sess, save_path=path, global_step=step)

    def restore(self, sess, path):
        """Load model weights from the checkpoint at ``path`` into ``sess``."""
        self.model_saver.restore(sess, save_path=path)
        print('{} model reloaded from {}'.format(self.scope_name, path))

    def experience(self, new_trajectory, other_info=None):
        """Store one trajectory plus its GMV/cost views in the replay buffers.

        ``other_info`` must carry the "gmv" and "cost" trajectory variants;
        the main trajectory goes to whichever primary buffer is in use.
        """
        new_trajectory_gmv = other_info["gmv"]
        new_trajectory_cost = other_info["cost"]
        main_buffer = (self.prioritized_replay_buffer
                       if self.use_prioritized_experience_replay
                       else self.replay_buffer)
        add_episode(main_buffer, new_trajectory, gamma=self.gamma)
        add_episode(self.gmv_replay_buffer,
                    new_trajectory_gmv,
                    gamma=self.gamma)
        add_episode(self.cost_replay_buffer,
                    new_trajectory_cost,
                    gamma=self.gamma)

    def __epsilon_greedy__(self, sess, observation, roi_thr):
        """Exploring bid: with probability ``epsilon`` add OU noise to the
        actor output, otherwise fall back to the deterministic policy.
        """
        if np.random.uniform() >= self.epsilon:
            return self.__greedy__(sess, observation, roi_thr)

        batched_obs = observation[np.newaxis, :]
        actor_out = sess.run(self.a_eval,
                             feed_dict={
                                 self.s: batched_obs,
                                 self.roi_thr: roi_thr
                             })
        # Ornstein-Uhlenbeck noise encourages temporally correlated exploration.
        noisy_bid = actor_out + self.exploration_noise.noise()
        return noisy_bid[0]

    def __greedy__(self, sess, observation, roi_thr):
        """Deterministic actor output for a single observation."""
        feed = {
            self.s: observation[np.newaxis, :],
            self.roi_thr: roi_thr,
        }
        bids = sess.run(self.a_eval, feed_dict=feed)
        return bids[0]

    def choose_action(self, sess, observation, other_info):
        """Training-time action: epsilon-greedy with OU exploration noise."""
        roi_thr = (self.get_roi_threshold() if self.use_budget_control
                   else self.init_roi)
        return self.__epsilon_greedy__(sess, observation, roi_thr)

    def greedy_action(self, sess, observation, other_info):
        """Deterministic bid with optional per-user budget gating.

        With budget control on, the first request of a user admits that user
        only if the critic's estimated long-term ROI clears the PID
        threshold; later requests reuse the stored admission decision and
        bid 0 for non-admitted users.
        """
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi

        bid = self.__greedy__(sess, observation, roi_thr)
        if self.use_budget_control:
            user_idx = other_info["user_idx"]
            request_idx = other_info["request_idx"]
            roi_threshold = self.get_roi_threshold()
            if request_idx == 0:
                # First request for this user: decide admission from the
                # critic's ROI estimate at the proposed bid.
                observations = observation[np.newaxis, :]
                max_plongterm_roi = sess.run(self.max_longterm_roi,
                                             feed_dict={
                                                 self.s: observations,
                                                 self.a: [bid]
                                             })

                if max_plongterm_roi >= roi_threshold:
                    # Record the admission so follow-up requests keep bidding.
                    self.explore_user(user_idx)

                    return bid
                else:

                    return 0.
            else:
                # Follow-up request: bid only for users admitted earlier.
                if self.is_user_selected(user_idx):

                    return bid
                else:
                    return 0
        else:

            return bid

    def get_action(self, sess, obs, is_test=False, other_info=None):
        """Return ``(scaled_bid, info)``: greedy when ``is_test``, otherwise
        epsilon-greedy; the raw action is clipped to [0, 1] then scaled by
        ``action_bound``.
        """
        if is_test:
            raw_action = self.greedy_action(sess, obs, other_info)
        else:
            raw_action = self.choose_action(sess, obs, other_info)

        scaled_bid = self.action_bound * np.clip(raw_action, 0, 1)
        return scaled_bid, {"learning_action": raw_action}

    def get_memory_returns(self):
        """Mean return currently stored in the active replay buffer."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay
                         else self.replay_buffer)
        return active_buffer.current_mean_return

    def _is_exploration_enough(self, min_pool_size):
        """True once the active replay buffer holds >= ``min_pool_size`` items."""
        active_buffer = (self.prioritized_replay_buffer
                         if self.use_prioritized_experience_replay
                         else self.replay_buffer)
        return len(active_buffer) >= min_pool_size

    def update_target(self, sess):
        """Sync actor/critic target networks from their eval counterparts.

        Soft (Polyak) updates every ``soft_update_iter`` epochs when
        ``softupdate`` is set; otherwise hard copies every
        ``replace_target_iter`` epochs.
        """
        if self.softupdate:

            if self.epoch % self.soft_update_iter == 0:
                sess.run(self.gmv_c_update_target_q)
                sess.run(self.cost_c_update_target_q)
                sess.run(self.a_update_target_q)
        else:

            if self.epoch % self.replace_target_iter == 0:
                # BUGFIX: this branch previously reran the critics' *soft*
                # update ops; use the hard-copy ops built in _build_net,
                # matching the actor's replace op below.
                sess.run(self.gmv_c_target_replace_op)
                sess.run(self.cost_c_target_replace_op)
                sess.run(self.a_target_replace_op)

    def train(self, sess):
        """Run one training epoch.

        Returns a 4-tuple ``(trained, stats, mean_return, epsilon)`` where
        ``stats`` is the 14-element list produced by the chosen trainer.
        """
        if self.has_target_net:
            self.update_target(sess)

        self.epoch += 1

        # Wait until the replay buffer holds at least one full batch.
        if not self._is_exploration_enough(self.batch_size):
            return False, [0] * 14, 0, 0

        trainer = (self.train_prioritized
                   if self.use_prioritized_experience_replay
                   else self.train_normal)
        (policy_loss, policy_entropy, loss, montecarlo_loss, q_eval, returns,
         gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns,
         cost_loss, cost_montecarlo_loss, cost_q_eval,
         cost_returns) = trainer(sess)

        # Linear epsilon annealing every epsilon_dec_iter epochs.
        if self.epoch % self.epsilon_dec_iter == 0:
            self.epsilon = max(self.epsilon - self.epsilon_dec,
                               self.epsilon_min)
            print("update epsilon:", self.epsilon)

        stats = [policy_loss, policy_entropy, loss, montecarlo_loss, q_eval,
                 returns, gmv_loss, gmv_montecarlo_loss, gmv_q_eval,
                 gmv_returns, cost_loss, cost_montecarlo_loss, cost_q_eval,
                 cost_returns]
        return True, stats, self.get_memory_returns(), self.epsilon

    def train_prioritized(self, sess):
        """One prioritized-replay pass; returns the same 14 stats as
        ``train_normal`` (leading policy_loss/policy_entropy slots are 0).

        Fixes vs. the original:
        - it returned only 4 values while ``train`` unpacks 14;
        - it fetched ``self._train_c_op``, which is never created
          (_build_net defines _train_op / _train_gmv_c_op / _train_cost_c_op);
        - it omitted the ``roi_thr`` and GMV/cost feeds required by the
          combined-critic loss and the ``priority_values`` tensors.
        """
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0
        cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi
        for idx in range(self.update_times_per_train):
            sample_indices = self.prioritized_replay_buffer.make_index(
                self.batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns, weights, ranges = self.prioritized_replay_buffer.sample_index(
                sample_indices)
            # Reuse the same indices so all three buffers describe the same
            # transitions (mirrors train_normal).
            _, _, rew_gmv, _, _, _, gmv_returns = self.gmv_replay_buffer.sample_index(
                sample_indices)
            _, _, rew_cost, _, _, _, cost_returns = self.cost_replay_buffer.sample_index(
                sample_indices)

            feed = {
                self.s: obs,
                self.a: act,
                self.r_gmv: rew_gmv,
                self.r_cost: rew_cost,
                self.r: rew,
                self.s_: obs_next,
                self.done: done,
                self.gmv_return_value: gmv_returns,
                self.cost_return_value: cost_returns,
                self.return_value: returns,
                self.important_sampling_weight_ph: weights,
                self.roi_thr: roi_thr,
            }
            _, loss, montecarlo_loss, q_eval, \
            _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \
            _2, cost_loss, cost_montecarlo_loss, cost_q_eval, \
            priority_values = sess.run(
                [self._train_op, self.loss, self.montecarlo_loss, self.critic_eval,
                 self._train_gmv_c_op, self.gmv_loss, self.gmv_montecarlo_loss, self.gmv_critic_eval,
                 self._train_cost_c_op, self.cost_loss, self.cost_montecarlo_loss, self.cost_critic_eval,
                 self.priority_values],
                feed_dict=feed)
            # Actor (policy) update against the freshly updated critics.
            sess.run(self._train_a_op, feed_dict=feed)

            # Small epsilon keeps every transition sampleable.
            priorities = priority_values + 1e-6
            self.prioritized_replay_buffer.update_priorities(
                sample_indices, priorities)

        return 0, 0, loss, montecarlo_loss, np.average(q_eval), np.average(returns), \
               gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \
               cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)

    def train_normal(self, sess):
        """One uniform-replay pass: critic updates, then an actor update.

        Returns the 14 stats consumed by ``train`` (the leading policy_loss
        and policy_entropy slots are always 0 here).
        """
        policy_loss, policy_entropy = 0, 0
        loss, montecarlo_loss, q_eval, returns = 0, 0, 0, 0
        gmv_loss, gmv_montecarlo_loss, gmv_q_eval, gmv_returns = 0, 0, 0, 0
        cost_loss, cost_montecarlo_loss, cost_q_eval, cost_returns = 0, 0, 0, 0
        if self.use_budget_control:
            roi_thr = self.get_roi_threshold()
        else:
            roi_thr = self.init_roi
        for idx in range(self.update_times_per_train):
            # Reuse the same indices on all three buffers so the GMV and
            # cost samples describe the same transitions.
            sample_indices = self.replay_buffer.make_index(self.batch_size)
            obs, act, rew, obs_next, done, dis_2_end, returns = self.replay_buffer.sample_index(
                sample_indices)
            obs, act, rew_gmv, obs_next, done, dis_2_end, gmv_returns = self.gmv_replay_buffer.sample_index(
                sample_indices)
            obs, act, rew_cost, obs_next, done, dis_2_end, cost_returns = self.cost_replay_buffer.sample_index(
                sample_indices)

            # Joint, GMV and cost critic updates in one run call.
            _, loss, montecarlo_loss, q_eval, \
            _1, gmv_loss, gmv_montecarlo_loss, gmv_q_eval, \
            _2, cost_loss, cost_montecarlo_loss, cost_q_eval \
                = sess.run(
                [self._train_op, self.loss, self.montecarlo_loss, self.critic_eval,
                 self._train_gmv_c_op, self.gmv_loss, self.gmv_montecarlo_loss, self.gmv_critic_eval,
                 self._train_cost_c_op, self.cost_loss, self.cost_montecarlo_loss, self.cost_critic_eval],
                feed_dict={
                    self.s: obs,
                    self.a: act,
                    self.r_gmv: rew_gmv,
                    self.r_cost: rew_cost,
                    self.r: rew,
                    self.s_: obs_next,
                    self.done: done,
                    self.gmv_return_value: gmv_returns,
                    self.cost_return_value: cost_returns,
                    self.return_value: returns,
                    self.roi_thr: roi_thr
                })
            # Actor (policy) update; note the deliberately smaller feed dict.
            _, actor_loss = sess.run(
                [self._train_a_op, self.actor_loss],
                feed_dict={
                    self.roi_thr: roi_thr,
                    self.s: obs,
                    self.a: act,
                    self.r_gmv: rew_gmv,
                    self.r_cost: rew_cost,
                    self.s_: obs_next,
                    self.done: done,
                    self.gmv_return_value: gmv_returns,
                    self.cost_return_value: cost_returns,
                })

        return 0, 0, loss, montecarlo_loss, np.average(q_eval), np.average(returns), \
               gmv_loss, gmv_montecarlo_loss, np.average(gmv_q_eval), np.average(gmv_returns), \
               cost_loss, cost_montecarlo_loss, np.average(cost_q_eval), np.average(cost_returns)
# --- example 23 ---
class DDQNAgentCnn(GeneralAgent):
    def __init__(self,
                 gamma,
                 action_number,
                 minibatch,
                 episodes,
                 begin_train,
                 copy_step,
                 epsilon_delta,
                 epsilon_start,
                 epsilon_end,
                 load_model,
                 path_to_load,
                 path_to_save,
                 plots_to_save,
                 episode_steps,
                 episode_to_save,
                 max_buffer_len,
                 model_type
                 ):
        """DQN-style agent with a CNN state encoder and a frozen target network.

        Generic configuration (paths, gamma, model type, ...) is forwarded to
        GeneralAgent; this constructor only wires up exploration, replay and
        bookkeeping state.
        """
        super().__init__(gamma=gamma,
                         action_number=action_number,
                         path_to_load=path_to_load,
                         path_to_save=path_to_save,
                         plots_to_save=plots_to_save,
                         load_model=load_model,
                         episode_to_save=episode_to_save,
                         episodes=episodes,
                         model_type=model_type)

        # Exploration schedule; epsilon is annealed by reduce_epsilon().
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_delta = epsilon_delta
        self.epsilon = epsilon_start

        # Optimization / scheduling knobs.
        self.minibatch = minibatch
        self.begin_train = begin_train
        self.copy_step = copy_step
        self.episode_steps = episode_steps

        # Rolling interaction state and experience storage.
        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Frozen copy of the online network, synced via update_target().
        self.target_model = model_type(action_number).to(self.device)
        self.update_target()

        # Reward / loss bookkeeping for logging and plots.
        self.rewards_white = []
        self.rewards_black = []
        self.rewards = []
        self.losses = []
        self.periodic_reward = 0
        self.periodic_rewards = []

    def update_target(self):
        """Overwrite the target network's weights with the online network's."""
        snapshot = self.model.state_dict()
        self.target_model.load_state_dict(snapshot)

    def reduce_epsilon(self, episode):
        """Exponentially anneal epsilon from epsilon_start toward epsilon_end."""
        # epsilon_delta acts as the decay time constant of the schedule.
        decay = np.exp(-1. * episode / self.epsilon_delta)
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * decay

    def epsilon_greedy(self):
        """Choose an action: uniform-random with probability epsilon, else greedy.

        Stores the chosen action index in self.action and returns it.
        """
        if (1 - self.epsilon) <= np.random.random():
            # Explore: P(random) = P(r >= 1 - eps) = eps.
            self.action = np.random.randint(self.action_number)
        else:
            # Exploit: argmax_a Q(state, a) from the online network.
            # no_grad() avoids building an autograd graph for pure inference;
            # the original wrapped the input in torch.autograd.Variable, which
            # is deprecated (a no-op) since PyTorch 0.4.
            with torch.no_grad():
                state = torch.FloatTensor(self.state).to(self.device).unsqueeze(0)
                self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(obs):
        """Crop the frame, grayscale it, downscale to 28x28 and add a channel axis."""
        # Crop rows 0:188 and columns 23:136 (presumably the playfield region
        # of the emulator frame -- TODO confirm against the environment).
        cropped = obs[0:188, 23:136, :]
        gray = rgb2gray(cropped)
        small = resize(gray, (28, 28), mode='constant')
        return small.reshape(1, 28, 28)

    def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done):
        """Convert a sampled transition batch to tensors on the agent's device.

        Returns (states, actions, rewards, next_states, dones) with actions as
        LongTensor and everything else as FloatTensor.
        """
        dev = self.device
        states = torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(dev))
        actions = torch.autograd.Variable(torch.LongTensor(o_act).to(dev))
        rewards = torch.autograd.Variable(torch.FloatTensor(o_reward).to(dev))
        next_states = torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(dev))
        dones = torch.autograd.Variable(torch.FloatTensor(o_done).to(dev))
        return states, actions, rewards, next_states, dones

    def train_model(self):
        """Run one optimization step on a minibatch sampled from the replay buffer.

        Uses the Double-DQN target (the class is named DDQNAgentCnn, but the
        original computed a plain-DQN target): the online network selects the
        best next action, the target network evaluates it. Returns the scalar
        loss value.
        """
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        # Q(s, a) for the actions actually taken in the batch.
        q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
        # The target is a constant w.r.t. the online parameters, so build it
        # without tracking gradients (replaces the .detach() on y_hat).
        with torch.no_grad():
            # Double DQN: argmax with the online net, value from the target net.
            next_act = self.model(o_next_state).max(1)[1]
            q_next = self.target_model(o_next_state).gather(1, next_act.unsqueeze(1)).squeeze(1)
            # (1 - done) zeroes the bootstrap term for terminal transitions.
            y_hat = o_reward + self.gamma * q_next * (1 - o_done)
        loss = (q - y_hat).pow(2).mean()

        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()
        # Return a plain float so callers that store the loss (self.losses)
        # don't retain a reference to the autograd graph.
        return loss.item()

    def init_new_episode(self, env):
        """Reset the environment and cache the preprocessed initial state."""
        self.state = self.preprocess_observation(env.reset())

    def episode_check(self, episode, loss):
        """Periodic housekeeping: target sync, reward windowing, checkpoint + plots.

        Called once per environment step with the step index (`episode`) and the
        most recent training loss.
        """
        if episode % self.copy_step == 0:
            # Record the latest loss and refresh the frozen target network.
            self.losses.append(loss)
            self.update_target()

        if episode % self.episode_steps == 0:
            # Close out the current reward window as a per-step average.
            self.periodic_rewards.append(self.periodic_reward / self.episode_steps)
            self.periodic_reward = 0

        if episode % self.episode_to_save == 0:
            torch.save(self.model.state_dict(), self.path_to_save)
            # One figure per tracked series; the loop replaces three verbatim
            # copies of the same figure/plot/save/close sequence.
            series = (
                (self.rewards, '_reward.png'),
                (self.losses, '_loss.png'),
                (self.periodic_rewards, '_periodic_reward.png'),
            )
            for values, suffix in series:
                fig = plt.figure()
                plt.plot(values)
                fig.savefig(self.plots_to_save + suffix)
                plt.close(fig)
            
    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        """Main interaction/training loop: act epsilon-greedily, store transitions,
        and optimize once the replay buffer holds at least begin_train samples."""
        self.init_new_episode(env)
        total_reward = 0
        episode_reward = 0
        loss = 0  # placeholder until the first optimization step runs
        # NOTE(review): self.trangle looks like a tqdm-wrapped range supplied by
        # the base class -- confirm against GeneralAgent.
        for episode in self.trangle:
            # Progress-bar status line with running reward statistics.
            self.trangle.set_description(
                f"Episode: {episode} | Episode Reward {episode_reward} | Periodic reward "
                f"{self.periodic_reward / self.episode_steps} | Average Reward {total_reward / (episode + 1)}"
            )
            self.trangle.refresh()
            # Act, step the environment, and record the transition.
            action = self.epsilon_greedy()
            next_observation, reward, done, _ = env.step(action)
            total_reward += reward
            episode_reward += reward
            self.periodic_reward += reward
            next_state = self.preprocess_observation(next_observation)
            self.replay_buffer.push(self.state, action, reward, next_state, done)
            self.state = next_state
            # Start learning only once enough experience has accumulated.
            if len(self.replay_buffer) >= self.begin_train:
                loss = self.train_model()

            self.reduce_epsilon(episode)
            self.episode_check(episode, loss)

            if done:
                # Episode over: log its reward and reset the environment.
                self.init_new_episode(env)
                self.rewards.append(episode_reward)
                episode_reward = 0