Example #1
class DQNAgent(object):
    def __init__(self, env, agent_params):

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        # import ipdb; ipdb.set_trace()
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """

        # store the latest observation ("frame") into the replay buffer
        # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer`
        # in dqn_utils.py
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # use epsilon greedy exploration when selecting the action
        # HINT: take a random action
        # with probability eps (see np.random.random())
        # OR if your current step number (see self.t) is less than self.learning_starts
        perform_random_action = (np.random.random() < eps
                                 or self.t < self.learning_starts)
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # HINT: Your actor will take in multiple previous observations ("frames") in order
            # to deal with the partial observability of the environment. Get the most recent
            # `frame_history_len` observations using functionality from the replay buffer,
            # and then use those observations as input to your actor.
            recent_obs = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(recent_obs)

        # take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        self.last_obs, reward, done, info = self.env.step(action)

        # store the result of taking this action into the replay buffer
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # if taking this step resulted in done, reset the env (and the latest observation)
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # update the critic by calling its update function with the sampled tensors
            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                     terminal_n)

            # update the target network periodically
            # HINT: your critic already has this functionality implemented
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
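The agent above reads eps from self.exploration.value(self.t). For reference, here is a minimal sketch of a linearly decaying schedule exposing the same value(t) interface; the class name, constructor arguments, and the 30000-step horizon are illustrative assumptions, not the assignment's actual exploration_schedule.

import numpy as np

class LinearSchedule:
    """Anneal epsilon linearly from initial_eps to final_eps over `timesteps` steps."""

    def __init__(self, timesteps, final_eps, initial_eps=1.0):
        self.timesteps = timesteps
        self.final_eps = final_eps
        self.initial_eps = initial_eps

    def value(self, t):
        # fraction of the annealing horizon elapsed so far, clipped to [0, 1]
        frac = min(float(t) / self.timesteps, 1.0)
        return self.initial_eps + frac * (self.final_eps - self.initial_eps)

# usage mirroring step_env above (hypothetical numbers)
schedule = LinearSchedule(timesteps=30000, final_eps=0.02)
eps = schedule.value(1000)
perform_random_action = np.random.random() < eps  # epsilon-greedy coin flip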
Example #2
class ExplorationOrExploitationAgent(DQNAgent):
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        
        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)
        
        if agent_params['use_pred_error']:
            print("EXPLORATION: Using prediction error model")
            self.explore_model = "pred_error"
            self.exploration_model = PredErrorModel(agent_params, self.optimizer_spec)
        else:
            self.explore_model = "rnd"
            self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
            

        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']
        
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
        self.t = 0

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            # After exploration is over, set the actor to optimize the extrinsic critic
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)
        ):
            # Get Reward Weights
            # Get the current explore reward weight and exploit reward weight
            #       using the schedules passed in (see __init__)
            # COMMENT: Until part 3, explore_weight = 1, and exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            # Evaluate the exploration model on s' to get the exploration bonus
            # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude
            # next_ob_no: shape (self.batch_size, self.ob_dim)
            # expl_bonus: shape (self.batch_size,)
            if self.explore_model == "pred_error":
                expl_bonus = self.exploration_model.forward_np(ob_no, next_ob_no)
            else:
                expl_bonus = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = normalize(expl_bonus, np.mean(expl_bonus),
                    np.std(expl_bonus))

            # Reward Calculations #
            # Calculate mixed rewards, which will be passed into the exploration critic
            # HINT: See doc for definition of mixed_reward
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

            # Calculate the environment reward
            # HINT: For part 1, env_reward is just 're_n'
            #       After this, env_reward is 're_n' shifted by self.exploit_rew_shift,
            #       and scaled by self.exploit_rew_scale
            env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #

            # 1) Update the exploration model (based off s')
            # 2) Update the exploration critic (based off mixed_reward)
            # 3) Update the exploitation critic (based off env_reward)
            if self.explore_model == "pred_error":
                expl_model_loss = self.exploration_model.update(ob_no, next_ob_no)
            else:
                expl_model_loss = self.exploration_model.update(next_ob_no)
            exploration_critic_loss = self.exploration_critic.update(ob_no, ac_na,
                    next_ob_no, mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(ob_no, ac_na,
                    next_ob_no, env_reward, terminal_n)

            # Update the Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploration_critic.update_target_network()
                self.exploitation_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
            log['Exploration Model Loss'] = expl_model_loss

            # CQL-specific logging from the exploitation critic (requires cql_critic.py)
            log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log


    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts

        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(action)
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
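Example #2 rescales the exploration bonus with a normalize(data, mean, std) helper whose body is not shown in this snippet. A minimal sketch of what such a helper typically does, assuming the standard zero-mean / unit-variance form with a small epsilon to avoid division by zero:

import numpy as np

def normalize(data, mean, std, eps=1e-8):
    # shift to zero mean and rescale to unit variance; eps guards against std == 0
    return (data - mean) / (std + eps)

# usage matching the bonus normalization in train() above
expl_bonus = np.array([0.5, 3.0, 10.0, 0.1])
expl_bonus = normalize(expl_bonus, np.mean(expl_bonus), np.std(expl_bonus))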
Example #3
class DQNAgent(object):
    def __init__(self, sess, env, agent_params):

        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.exploration_type = agent_params['exploration_type']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(sess, agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(sess, self.critic)

        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """

        # DONE store the latest observation into the replay buffer
        # HINT: see replay buffer's function store_frame
        self.replay_buffer_idx = self.replay_buffer.store_frame(
            self.last_obs)  # DONE

        # --- new code: Boltzmann exploration option ---

        if self.exploration_type == 'boltzmann':
            enc_last_obs = self.replay_buffer.encode_recent_observation()  # DONE
            enc_last_obs = enc_last_obs[None, :]
            action = sample_boltzmann(
                self.sess.run(self.critic.q_t_values,
                              feed_dict={self.critic.obs_t_ph: enc_last_obs}),
                self.exploration.value(self.t))[0]
        else:
            # --- end of new code; original epsilon-greedy branch below ---

            eps = self.exploration.value(self.t)
            # DONE use epsilon greedy exploration when selecting action
            # HINT: take a random action
            # with probability eps (see np.random.random())
            # OR if your current step number (see self.t) is less than self.learning_starts
            perform_random_action = (self.t < self.learning_starts
                                     or np.random.random() < eps)  # DONE

            if perform_random_action:
                action = int(self.num_actions * np.random.random())  # DONE
            else:
                # DONE query the policy to select action
                # HINT: you cannot use "self.last_obs" directly as input
                # into your network, since it needs to be processed to include context
                # from previous frames.
                # Check out the replay buffer, which has a function called
                # encode_recent_observation that will take the latest observation
                # that you pushed into the buffer and compute the corresponding
                # input that should be given to a Q network by appending some
                # previous frames.
                enc_last_obs = self.replay_buffer.encode_recent_observation()  # DONE
                enc_last_obs = enc_last_obs[None, :]

                # DONE query the policy with enc_last_obs to select action
                action = self.actor.get_action(enc_last_obs)  # DONE
                action = action[0]

        # DONE take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        # HINT2: remember the following useful function that you've seen before:
        #obs, reward, done, info = env.step(action)
        self.last_obs, reward, done, info = self.env.step(action)  # DONE

        # DONE store the result of taking this action into the replay buffer
        # HINT1: see replay buffer's store_effect function
        # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)  # DONE

        # DONE if taking this step resulted in done, reset the env (and the latest observation)
        if done: self.last_obs = self.env.reset()  # DONE

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.
        """

        loss = 0.0
        if (self.t > self.learning_starts and \
                self.t % self.learning_freq == 0 and \
                self.replay_buffer.can_sample(self.batch_size)):

            # DONE populate all placeholders necessary for calculating the critic's total_error
            # HINT: obs_t_ph, act_t_ph, rew_t_ph, obs_tp1_ph, done_mask_ph
            feed_dict = {
                self.critic.learning_rate: self.optimizer_spec.lr_schedule.value(self.t),
                self.critic.obs_t_ph: ob_no,
                self.critic.act_t_ph: ac_na,
                self.critic.rew_t_ph: re_n,
                self.critic.obs_tp1_ph: next_ob_no,
                self.critic.done_mask_ph: terminal_n,
            }

            # DONE: create a LIST of tensors to run in order to
            # train the critic as well as get the resulting total_error
            tensors_to_run = [self.critic.total_error,
                              self.critic.train_fn]  # DONE
            loss, _ = self.sess.run(tensors_to_run, feed_dict=feed_dict)
            # Note: remember that the critic's total_error value is what you
            # created to compute the Bellman error in a batch,
            # and the critic's train function performs a gradient step
            # and updates the network parameters to reduce that total_error.

            # DONE: use sess.run to periodically update the critic's target function
            # HINT: see update_target_fn
            if self.num_param_updates % self.target_update_freq == 0:
                self.sess.run(self.critic.update_target_fn)  # DONE

            self.num_param_updates += 1

        self.t += 1

        return loss
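The Boltzmann branch in Example #3 relies on a sample_boltzmann(q_values, temperature) helper that is not defined in this snippet. A minimal numpy sketch of such a helper, assuming it applies a temperature-scaled softmax to a batch of Q-values and returns one sampled action index per row:

import numpy as np

def sample_boltzmann(q_values, temperature):
    """Sample actions from a softmax over Q-values (shape: batch x num_actions)."""
    q = np.asarray(q_values, dtype=np.float64)
    # subtract the row-wise max for numerical stability before exponentiating
    logits = (q - q.max(axis=1, keepdims=True)) / max(temperature, 1e-8)
    probs = np.exp(logits)
    probs /= probs.sum(axis=1, keepdims=True)
    return np.array([np.random.choice(len(p), p=p) for p in probs])

# usage mirroring the call in step_env above
action = sample_boltzmann(np.array([[1.0, 2.0, 0.5]]), temperature=0.5)[0]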
Example #4
class DQNAgent(object):
    def __init__(self, env, agent_params):

        print(agent_params['optimizer_spec'])

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.device = agent_params['device']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic, self.device)

        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):

        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """

        # store the latest observation into the replay buffer
        # HINT: see the replay buffer's store_frame function
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)
        # use epsilon greedy exploration when selecting the action
        # HINT: take a random action
        # with probability eps (see np.random.random())
        # OR if your current step number (see self.t) is less than self.learning_starts
        perform_random_action = np.random.random() < eps or self.t < self.learning_starts

        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # query the policy to select an action
            # HINT: you cannot use "self.last_obs" directly as input
            # into your network, since it needs to be processed to include context
            # from previous frames.
            # Check out the replay buffer, which has a function called
            # encode_recent_observation that will take the latest observation
            # that you pushed into the buffer and compute the corresponding
            # input that should be given to a Q network by appending some
            # previous frames.
            enc_last_obs = self.replay_buffer.encode_recent_observation()
            enc_last_obs = torch.tensor(enc_last_obs[None, :]).to(self.device)

            # query the policy with enc_last_obs to select an action
            action = self.actor.get_action(enc_last_obs)

        # take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        # HINT2: remember the following useful function that you've seen before:
        #   obs, reward, done, info = env.step(action)
        obs, reward, done, info = self.env.step(action)

        # store the result of taking this action into the replay buffer
        # HINT1: see the replay buffer's store_effect function
        # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        # if taking this step resulted in done, reset the env (and the latest observation); otherwise set last_obs to obs
        if done:
            self.last_obs = self.env.reset()
        else:
            self.last_obs = obs

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [],[],[],[],[]

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):

        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.
        """
        loss = 0
        if (self.t > self.learning_starts and \
                self.t % self.learning_freq == 0 and \
                self.replay_buffer.can_sample(self.batch_size)):

            # populate the parameters of critic.update() (implemented in the critic)
            loss = self.critic.update(ob_no, ac_na, re_n, next_ob_no, terminal_n)

            # load the newest parameters into the target network
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return loss
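Example #4's target update is delegated to critic.update_target_network(). For reference, a hard target update in PyTorch usually amounts to copying the online Q-network's parameters into the target network; the function and attribute names below (q_net, q_net_target) are assumptions for illustration, not the actual DQNCritic API.

import torch.nn as nn

def hard_update_target(q_net: nn.Module, q_net_target: nn.Module):
    # overwrite the target network's weights with the online network's weights
    q_net_target.load_state_dict(q_net.state_dict())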
Example #5
class DQNAgent(object):
    def __init__(self, env, agent_params, **kwargs):

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()
        self.total_episode_reward = 0.0
        self.total_episodes = []
        self.episode_num = 0

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']
        self.gamma = agent_params['gamma']

        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params)
        self.q_t_loss = tf.keras.losses.Huber()
        self.q_t_optimizer = self.optimizer_spec.constructor(clipnorm=agent_params['grad_norm_clipping'],
                                                             learning_rate=self.optimizer_spec.lr_schedule,
                                                             **self.optimizer_spec.kwargs)

        self.actor = ArgMaxPolicy(self.critic)

        self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'],
                                                         agent_params['frame_history_len'],
                                                         obs_dtype=agent_params['obs_dtype'])
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):

        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """

        eps = self.exploration(self.t)
        # TODO use epsilon greedy exploration when selecting action
        # HINT: take random action 
        # with probability eps (see np.random.random())
        # OR if your current step number (see self.t) is less than self.learning_starts
        perform_random_action = random.random() < eps

        if perform_random_action:
            action = random.randrange(self.num_actions)
        else:
            # TODO query the policy to select action
            # HINT: you cannot use "self.last_obs" directly as input
            # into your network, since it needs to be processed to include context
            # from previous frames. 
            # Check out the replay buffer, which has a function called
            # encode_recent_observation that will take the latest observation
            # that you pushed into the buffer and compute the corresponding
            # input that should be given to a Q network by appending some
            # previous frames.
            enc_last_obs = self.replay_buffer.encode_next_frame_observation(self.last_obs)[np.newaxis, ...]

            # TODO query the policy with enc_last_obs to select action
            action = self.actor.get_action(enc_last_obs).numpy().item()

        # TODO take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        # HINT2: remember the following useful function that you've seen before:
        # obs, reward, done, info = env.step(action)
        prev_obs = self.last_obs
        self.last_obs, reward, env_done, _ = self.env.step(action)
        self.total_episode_reward += reward

        # TODO store the result of taking this action into the replay buffer
        # HINT1: see replay buffer's store_effect function
        # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_step(prev_obs, action, reward, env_done)

        # TODO if taking this step resulted in done, reset the env (and the latest observation)
        if env_done:
            self.episode_num += 1
            print('Total episode {}: {}'.format(self.episode_num, self.total_episode_reward))
            self.last_obs = self.env.reset()
            self.total_episodes.append(self.total_episode_reward)
            self.total_episode_reward = 0.0

    def sample(self, batch_size):
        return None, None, None, None, None

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):

        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.
        """

        loss = 0.0
        if ((self.t > self.learning_starts) and (self.t % self.learning_freq == 0) and (
        self.replay_buffer.can_sample(self.batch_size))):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = (
                tf.convert_to_tensor(x) for x in self.replay_buffer.sample(self.batch_size))
            next_state_target_q_a = self.critic.q_t_target(next_obs_batch)

            with tf.GradientTape() as tape:
                if self.critic.double_q:
                    next_state_q_a = self.critic.q_t_model(next_obs_batch)
                    next_actions = tf.argmax(next_state_q_a, axis=1)
                else:
                    next_actions = tf.argmax(next_state_target_q_a, axis=1)
                next_state_actions_mask = tf.one_hot(next_actions, depth=self.num_actions)
                q_target = rew_batch + self.gamma * tf.reduce_sum(
                    next_state_target_q_a * next_state_actions_mask, axis=1) * (1.0 - done_mask)
                q_target = tf.stop_gradient(q_target)
                current_state_q_a = self.critic.q_t_model(obs_batch)
                pred_q = tf.reduce_sum(current_state_q_a * tf.one_hot(act_batch, depth=self.num_actions), axis=1)
                loss_value = self.q_t_loss(q_target, pred_q)

            trainable_vars = self.critic.q_t_model.trainable_variables
            grads = tape.gradient(loss_value, trainable_vars)
            self.q_t_optimizer.apply_gradients(zip(grads, trainable_vars))
            self.num_param_updates += 1
            loss = loss_value.numpy().item()

            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.q_t_target.set_weights(self.critic.q_t_model.get_weights())

        self.t += 1
        return loss
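The GradientTape block in Example #5 builds the (double-)DQN regression target with one-hot masks. The same computation, rewritten as a small standalone numpy sketch for clarity (illustrative only, not part of the agent above):

import numpy as np

def double_dqn_targets(rew, done, q_next_online, q_next_target, gamma):
    """rew, done: shape (batch,); q_next_*: shape (batch, num_actions)."""
    # pick next actions with the online network, evaluate them with the target network
    next_actions = np.argmax(q_next_online, axis=1)
    next_q = q_next_target[np.arange(len(next_actions)), next_actions]
    return rew + gamma * next_q * (1.0 - done)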
Example #6
class ExplorationOrExploitationAgent(DQNAgent):
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        
        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)
        
        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']
        
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)
        ):

            # Get Reward Weights
            # COMMENT: Until part 3, explore_weight = 1, and exploit_weight = 0
            # explore_weight = 1
            # exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            expl_bonus = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = normalize(expl_bonus, np.mean(expl_bonus), np.std(expl_bonus))

            # Reward Calculations #
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n
            env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #
            expl_model_loss = self.exploration_model.update(next_ob_no)
            exploration_critic_loss = self.exploration_critic.update(ob_no, ac_na, next_ob_no,
                                                                     mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(ob_no, ac_na, next_ob_no,
                                                                       env_reward, terminal_n)

            # Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploitation_critic.update_target_network()
                self.exploration_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
            log['Exploration Model Loss'] = expl_model_loss
            log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log


    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts

        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(action)
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
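Example #6's comment notes that until part 3 the exploration weight is 1 and the exploitation weight is 0. A minimal sketch of a constant schedule exposing the same value(t) interface as explore_weight_schedule / exploit_weight_schedule (the class name and the fixed values are assumptions for illustration):

class ConstantSchedule:
    """Always returns the same weight, regardless of the timestep."""

    def __init__(self, weight):
        self.weight = weight

    def value(self, t):
        return self.weight

# fixed weights for the pure-exploration phase described in the comment above
explore_weight_schedule = ConstantSchedule(1.0)
exploit_weight_schedule = ConstantSchedule(0.0)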
Example #7
class DQNAgent(object):
    def __init__(self, env, agent_params):

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        # import ipdb; ipdb.set_trace()
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration: PiecewiseSchedule = \
            agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander,
        )
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
        Step the env and store the transition
        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new latest
        observation.
        """

        # store the latest observation ("frame") into the replay buffer
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # use epsilon greedy exploration when selecting action
        perform_random_action: bool = \
            np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            # take random action with probability eps (see np.random.random())
            # OR if your current step number (see self.t) is less than
            # self.learning_starts
            action = self.env.action_space.sample()
        else:
            # HINT: Your actor will take in multiple previous observations
            # ("frames") in order to deal with the partial observability of the
            # environment. Get the most recent `frame_history_len` observations
            # using functionality from the replay buffer, and then use those
            # observations as input to your actor.

            frames = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(frames)

        # take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        # HINT2: remember the following useful function that you've seen before:
        self.last_obs, reward, done, info = self.env.step(action)

        # store the result of taking this action into the replay buffer
        # HINT1: see your replay buffer's `store_effect` function
        # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(
            self.replay_buffer_idx,
            action,
            reward,
            done
        )

        # if taking this step resulted in done, reset the env (and the
        # latest observation)
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)
        ):

            log = self.critic.update(
                ob_no,
                ac_na,
                next_ob_no,
                re_n,
                terminal_n,
            )

            # update the target network periodically
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
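Example #7's hints explain that the actor consumes the last frame_history_len observations via the replay buffer's encode_recent_observation. A simplified numpy sketch of that frame stacking (zero-padding at the start of an episode and concatenating along the channel axis); this illustrates the idea only and is not the MemoryOptimizedReplayBuffer implementation:

import numpy as np

def stack_recent_frames(frames, frame_history_len):
    """frames: list of (H, W, C) observations from the current episode, oldest first."""
    recent = frames[-frame_history_len:]
    # zero-pad when fewer than frame_history_len frames have been seen so far
    pad = [np.zeros_like(frames[0])] * (frame_history_len - len(recent))
    return np.concatenate(pad + recent, axis=-1)

# usage: one 84x84 grayscale frame stacked into a 4-frame history
obs = [np.ones((84, 84, 1), dtype=np.uint8)]
stacked = stack_recent_frames(obs, frame_history_len=4)  # shape (84, 84, 4)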
Example #8
class DQNAgent(object):
    def __init__(self, env, agent_params):

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'],
                                                         agent_params['frame_history_len'],
                                                         lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """

        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            action = self.actor.get_action(self.replay_buffer.encode_recent_observation())

        obs, rew, done, info = self.env.step(action)
        self.last_obs = obs

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, rew, done)

        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            log = self.critic.update(ob_no, ac_na, re_n, next_ob_no, terminal_n)

            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
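Across these examples, critic.update receives a batch of (s, a, r, s', done) arrays and returns a loss or a log dict. A hypothetical PyTorch sketch of one such update step (vanilla DQN with a Huber loss); the networks, optimizer, and return format are assumptions for illustration and not the course's actual DQNCritic:

import torch
import torch.nn.functional as F

def dqn_update(q_net, q_net_target, optimizer, ob, ac, rew, next_ob, done, gamma=0.99):
    """One gradient step on the Bellman error for a sampled batch (all torch tensors)."""
    # Q(s, a) for the actions actually taken in the batch
    q_values = q_net(ob).gather(1, ac.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # bootstrap from the target network; zero out terminal transitions
        next_q, _ = q_net_target(next_ob).max(dim=1)
        target = rew + gamma * next_q * (1.0 - done)
    loss = F.smooth_l1_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return {'Training Loss': loss.item()}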