Example #1
# Imports needed by this snippet; EpochLogger comes from OpenAI Spinning Up.
import time

from spinup.utils.logx import EpochLogger


def run_policy(env,
               get_action,
               max_ep_len=None,
               num_episodes=100,
               render=True):

    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        o, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()
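
# Usage sketch, assuming Gym is installed. The random policy below is a
# stand-in for a trained agent's get_action and is not part of the original
# example.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    run_policy(env, lambda o: env.action_space.sample(),
               max_ep_len=500, num_episodes=5, render=False)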
Example #2
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=2000,
                 evaluation_step=1000,
                 max_ep_len=1000,
                 polyak=0.995,
                 start_steps=1000,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 policy_delay=2,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = gym.make(env_name)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.cur_tensorboard_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.polyak = polyak
        self.gamma = gamma
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
        self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

        if debug_mode:
            self.summary = tf.summary.create_file_writer(
                os.path.join(self.logger.output_dir, "logs"))

        self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
        self.target_actor_critic = core.ActorCritic(self.act_dim,
                                                    self.act_limit)
        self.replay_buffer = ReplayBuffer(replay_size)

        # self.critic = core.Critic()
        # net_params = self.critic.weights
        # self.target_actor_critic.set_weights(self.actor_critic.weights)
        self.target_init(self.target_actor_critic, self.actor_critic)
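
    def target_init(self, target_net, net):
        # Assumed implementation sketch, not shown in this excerpt: hard-copy
        # the online network's weights into the target network, as the
        # commented-out set_weights line above suggests. Both Keras models
        # must already be built (called once on a sample input) before
        # get_weights/set_weights will work.
        target_net.set_weights(net.get_weights())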
Example #3
    def __init__(self, env_name, train_step=250000 // 4, evaluation_step=125000 // 4, max_ep_len=27000 // 4, epsilon_train=0.1,
                epsilon_eval=0.01, batch_size=32, replay_size=int(1e6),
                epsilon_decay_period=250000 // 4, warmup_steps=20000 // 4, iteration=200, gamma=0.99,
                target_update_period=8000 // 4, update_period=4, logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        # self.env = make_atari(env_name)
        # self.env = wrap_deepmind(self.env, frame_stack=True)
        self.env = gym.make(env_name)
        env = self.env.env
        self.env = AtariPreprocessing(env)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0

        self.observation_shape = (84, 84)
        self.state_shape = (1,) + self.observation_shape + (4,)
        self.s = np.zeros(self.state_shape)
        self.last_s = np.zeros(self.state_shape)

        if debug_mode:
            self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs"))

        self.sess = tf.Session()
        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        # tf.summary.histogram("q", self.q)
        # tf.summary.histogram("q_target", self.q_target)
        # tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()
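
    def linearly_decaying_epsilon(self, decay_period, step, warmup_steps, epsilon):
        # Assumed Dopamine-style schedule implied by epsilon_train,
        # epsilon_decay_period, and warmup_steps (this helper is a sketch,
        # not part of the original excerpt): epsilon stays at 1.0 during
        # warmup, decays linearly to `epsilon` over `decay_period` steps,
        # then stays at `epsilon`.
        steps_left = decay_period + warmup_steps - step
        bonus = (1.0 - epsilon) * steps_left / decay_period
        bonus = max(0.0, min(bonus, 1.0 - epsilon))
        return epsilon + bonus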
Example #4
    def __init__(self, env_name, train_step=200, evaluation_step=1000, max_ep_len=200, epsilon_train=0.1,
                epsilon_eval=0.01, batch_size=32, replay_size=int(1e6),
                epsilon_decay_period=100, warmup_steps=0, iteration=200, gamma=0.99,
                target_update_period=50, update_period=10, logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = gym.make(env_name)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0

        if debug_mode:
            self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs"))

        self.sess = tf.Session()
        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        tf.summary.histogram("q", self.q)
        tf.summary.histogram("q_target", self.q_target)
        tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()
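
    def write_summaries(self, loss_value, q_values, q_target_values, target_q_values):
        # Sketch of how the TF1 placeholders and merged summary above are
        # typically consumed (this helper and its call site are assumptions,
        # not part of the original excerpt).
        summary_str = self.sess.run(self.merge, feed_dict={
            self.loss: loss_value,
            self.q: q_values,
            self.q_target: q_target_values,
            self.target_q: target_q_values,
        })
        self.summary.add_summary(summary_str, self.cur_train_step)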
Example #5
    def __init__(self, env_name, port=2000, gpu=0, batch_size=100, train_step=25000, evaluation_step=3000,
                 max_ep_len=6000, epsilon_train=0.1, epsilon_eval=0.01, replay_size=100000,
                 epsilon_decay_period=25000, warmup_steps=2000, iteration=200, gamma=0.99, q_lr=0.0001,
                 target_update_period=800, update_period=4, logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=False, port=port, gpu=gpu)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0
        self.cur_tensorboard = 0

        if debug_mode:
            self.summary = tf.summary.create_file_writer(os.path.join(self.logger.output_dir, "logs"))

        self.savepath = os.path.join(self.logger.output_dir, "saver")
        checkpoint = tf.train.Checkpoint(model=self.model, target_model=self.model_target)
        self.manager = tf.train.CheckpointManager(checkpoint, directory=self.savepath, max_to_keep=20, checkpoint_name="model.ckpt")
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
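
        # Typical use of the manager above (standard tf.train API; these call
        # sites are assumptions, not shown in this excerpt):
        #   self.manager.save(checkpoint_number=self.cur_train_step)    # save
        #   checkpoint.restore(self.manager.latest_checkpoint)          # restore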
Example #6
class EMAQ:
    def __init__(self,
                 env_fn,
                 env_name=None,
                 actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=100,
                 epochs=10000,
                 replay_size=int(2000000),
                 gamma=0.99,
                 polyak=0.995,
                 lr=3e-4,
                 p_lr=3e-5,
                 alpha=0.2,
                 batch_size=100,
                 start_steps=10000,
                 update_after=1000,
                 update_every=50,
                 num_test_episodes=10,
                 max_ep_len=1000,
                 logger_kwargs=dict(),
                 save_freq=1,
                 algo='CQL'):
        """
        EMAQ (Expected-Max Q-Learning Operator). The argument documentation
        below follows the Spinning Up SAC template this class is adapted from.


        Args:
            env_fn : A function which creates a copy of the environment.
                The environment must satisfy the OpenAI Gym API.

            actor_critic: The constructor method for a PyTorch Module with an ``act`` 
                method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
                The ``act`` method and ``pi`` module should accept batches of 
                observations as inputs, and ``q1`` and ``q2`` should accept a batch 
                of observations and a batch of actions as inputs. When called, 
                ``act``, ``q1``, and ``q2`` should return:

                ===========  ================  ======================================
                Call         Output Shape      Description
                ===========  ================  ======================================
                ``act``      (batch, act_dim)  | Numpy array of actions for each
                                               | observation.
                ``q1``       (batch,)          | Tensor containing one current estimate
                                               | of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ``q2``       (batch,)          | Tensor containing the other current
                                               | estimate of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ===========  ================  ======================================

                Calling ``pi`` should return:

                ===========  ================  ======================================
                Symbol       Shape             Description
                ===========  ================  ======================================
                ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                               | given observations.
                ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                               | actions in ``a``. Importantly: gradients
                                               | should be able to flow back into ``a``.
                ===========  ================  ======================================

            ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
                you provided to SAC.

            seed (int): Seed for random number generators.

            steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
                for the agent and the environment in each epoch.

            epochs (int): Number of epochs to run and train agent.

            replay_size (int): Maximum length of replay buffer.

            gamma (float): Discount factor. (Always between 0 and 1.)

            polyak (float): Interpolation factor in polyak averaging for target 
                networks. Target networks are updated towards main networks 
                according to:

                .. math:: \\theta_{\\text{targ}} \\leftarrow 
                    \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

                where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
                close to 1.)

            lr (float): Learning rate (used for both policy and value learning).

            alpha (float): Entropy regularization coefficient. (Equivalent to 
                inverse of reward scale in the original SAC paper.)

            batch_size (int): Minibatch size for SGD.

            start_steps (int): Number of steps for uniform-random action selection,
                before running real policy. Helps exploration.

            update_after (int): Number of env interactions to collect before
                starting to do gradient descent updates. Ensures replay buffer
                is full enough for useful updates.

            update_every (int): Number of env interactions that should elapse
                between gradient descent updates. Note: Regardless of how long 
                you wait between updates, the ratio of env steps to gradient steps 
                is locked to 1.

            num_test_episodes (int): Number of episodes to test the deterministic
                policy at the end of each epoch.

            max_ep_len (int): Maximum length of trajectory / episode / rollout.

            logger_kwargs (dict): Keyword args for EpochLogger.

            save_freq (int): How often (in terms of gap between epochs) to save
                the current policy and value function.

            """

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space,
                               self.env.action_space, **ac_kwargs)
        self.ac_targ = deepcopy(self.ac)
        self.gamma = gamma

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # List of parameters for both Q-networks (save this for convenience)
        self.q_params = itertools.chain(self.ac.q1.parameters(),
                                        self.ac.q2.parameters())

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=replay_size)

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        self.logger.log(
            '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
            var_counts)
        self.algo = algo

        self.lagrange_threshold = 10
        self.penalty_lr = 5e-2
        self.lamda = Variable(torch.log(torch.exp(torch.Tensor([5])) - 1),
                              requires_grad=True)
        self.lamda_optimizer = torch.optim.Adam([self.lamda],
                                                lr=self.penalty_lr)
        self.tune_lambda = True if 'lagrange' in self.algo else False

        self.alpha = 0
        self.target_update_freq = 1
        self.p_lr = 3e-5
        self.lr = 3e-4
        self.n_samples = 100
        self.env_name = env_name

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.num_test_episodes = num_test_episodes
        self.max_ep_len = max_ep_len
        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.update_after = update_after
        self.update_every = update_every
        self.batch_size = batch_size
        self.save_freq = save_freq
        self.polyak = polyak
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)
        print("Running Offline RL algorithm: {}".format(self.algo))

    def populate_replay_buffer(self):
        dataset = d4rl.qlearning_dataset(self.env)
        self.replay_buffer.obs_buf[:dataset['observations'].
                                   shape[0], :] = dataset['observations']
        self.replay_buffer.act_buf[:dataset['actions'].
                                   shape[0], :] = dataset['actions']
        self.replay_buffer.obs2_buf[:dataset['next_observations'].
                                    shape[0], :] = dataset['next_observations']
        self.replay_buffer.rew_buf[:dataset['rewards'].
                                   shape[0]] = dataset['rewards']
        self.replay_buffer.done_buf[:dataset['terminals'].
                                    shape[0]] = dataset['terminals']
        self.replay_buffer.size = dataset['observations'].shape[0]
        self.replay_buffer.ptr = (self.replay_buffer.size +
                                  1) % (self.replay_buffer.max_size)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        sampled_actions_q1 = None
        sampled_actions_q2 = None
        for i in range(self.n_samples):
            z = np.random.randn(a.shape[0], a.shape[1])
            z = torch.FloatTensor(z)
            actions, _ = self.sampling_policy.inverse(z, y=o2)
            if sampled_actions_q1 is None:
                sampled_actions_q1 = self.ac_targ.q1(o2, actions).view(-1, 1)
                sampled_actions_q2 = self.ac_targ.q2(o2, actions).view(-1, 1)
            else:
                sampled_actions_q1 = torch.cat(
                    (sampled_actions_q1, self.ac_targ.q1(o2, actions).view(
                        -1, 1)),
                    dim=1)
                sampled_actions_q2 = torch.cat(
                    (sampled_actions_q2, self.ac_targ.q2(o2, actions).view(
                        -1, 1)),
                    dim=1)

        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)

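        # EMAQ-style backup: rather than evaluating the learned policy's own
        # action, the target takes the max Q over the N behavior-model samples
        # gathered above, then the min across the two critics.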
        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)
            # Target Q-values
            q1_pi_targ = torch.max(sampled_actions_q1, dim=1).values
            q2_pi_targ = torch.max(sampled_actions_q2, dim=1).values
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + self.gamma * (1 - d) * (q_pi_targ -
                                                 self.alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    def update(self, data, update_timestep):
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Record things
        self.logger.store(LossQ=loss_q.item(), **q_info)

        # Finally, update target networks by polyak averaging.
        if update_timestep % self.target_update_freq == 0:
            with torch.no_grad():
                for p, p_targ in zip(self.ac.parameters(),
                                     self.ac_targ.parameters()):
                    # NB: We use in-place operations "mul_" and "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(self.polyak)
                    p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, o, deterministic=False):
        sampled_actions_q1 = None
        sampled_actions_q2 = None
        sampled_actions = []
        o = torch.FloatTensor(o).view(1, -1)
        for i in range(self.n_samples):
            z = np.random.randn(1, self.act_dim)
            z = torch.FloatTensor(z)
            actions, _ = self.sampling_policy.inverse(z, y=o)
            sampled_actions.append(actions)
            if sampled_actions_q1 is None:
                sampled_actions_q1 = self.ac.q1(o, actions).view(-1, 1)
                sampled_actions_q2 = self.ac.q2(o, actions).view(-1, 1)
            else:
                sampled_actions_q1 = torch.cat(
                    (sampled_actions_q1, self.ac.q1(o, actions).view(-1, 1)),
                    dim=1)
                sampled_actions_q2 = torch.cat(
                    (sampled_actions_q2, self.ac.q2(o, actions).view(-1, 1)),
                    dim=1)

        q_values = torch.min(sampled_actions_q1, sampled_actions_q2)
        max_idx = torch.argmax(q_values.view(-1))
        return sampled_actions[max_idx].detach().cpu().numpy()

    def test_agent(self):
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = self.test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            self.logger.store(TestEpRet=100 *
                              self.test_env.get_normalized_score(ep_ret),
                              TestEpLen=ep_len)

    def run(self):

        # Learn a generative model for data
        # density_epochs = 50
        # self.sampling_policy = core.MADE(self.act_dim, 256, 2 , cond_label_size = self.obs_dim[0])
        # density_optimizer = torch.optim.Adam(self.sampling_policy.parameters(), lr=1e-4, weight_decay=1e-6)
        # for i in range(density_epochs):
        #     sample_indices = np.random.choice(
        #             self.replay_buffer.size, self.replay_buffer.size)
        #     np.random.shuffle(sample_indices)
        #     ctr = 0
        #     total_loss = 0
        #     for j in range(0, self.replay_buffer.size, self.batch_size):
        #         actions = self.replay_buffer.act_buf[sample_indices[ctr * self.batch_size:(
        #                 ctr + 1) * self.batch_size],:]
        #         actions = torch.FloatTensor(actions)
        #         obs = self.replay_buffer.obs_buf[sample_indices[ctr * self.batch_size:(
        #                 ctr + 1) * self.batch_size],:]
        #         obs = torch.FloatTensor(obs)
        #         density_optimizer.zero_grad()
        #         loss = -self.sampling_policy.log_prob(actions,y=obs).mean()
        #         loss.backward()
        #         total_loss+=loss.data * self.batch_size
        #         density_optimizer.step()
        #         ctr+=1

        #     print("Density training loss: {}".format(total_loss/self.replay_buffer.size))
        self.sampling_policy = core.MADE(self.act_dim,
                                         256,
                                         3,
                                         cond_label_size=self.obs_dim[0])
        self.sampling_policy.load_state_dict(
            torch.load("behavior_policies/" + self.env_name + ".pt"))
        # self.sampling_policy = torch.load("marginals/"+self.env_name+".pt")

        # Prepare for interaction with environment
        total_steps = self.epochs * self.steps_per_epoch
        start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            # # Update handling
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update(data=batch, update_timestep=t)

            # End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch

                # Save model
                if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                    self.logger.save_state({'env': self.env}, None)

                # Test the performance of the deterministic version of the agent.
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('Time', time.time() - start_time)
                self.logger.dump_tabular()
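
# Usage sketch for EMAQ (assumptions: a d4rl environment and a pretrained
# MADE behavior policy saved at behavior_policies/<env_name>.pt, which run()
# loads; neither ships with this excerpt):
#     agent = EMAQ(lambda: gym.make('halfcheetah-medium-v0'),
#                  env_name='halfcheetah-medium-v0')
#     agent.populate_replay_buffer()
#     agent.run()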
Example #7
class CQL:

    def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=1000, epochs=10000, replay_size=int(2e6), gamma=0.99,
        polyak=0.995, lr=3e-4, p_lr=1e-4, alpha=0.2, batch_size=100, start_steps=10000,
        update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=1, policy_eval_start=0, algo='CQL', min_q_weight=5, automatic_alpha_tuning=False):
        """
        CQL (Conservative Q-Learning). The argument documentation below
        follows the Spinning Up SAC template this class is adapted from.


        Args:
            env_fn : A function which creates a copy of the environment.
                The environment must satisfy the OpenAI Gym API.

            actor_critic: The constructor method for a PyTorch Module with an ``act`` 
                method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
                The ``act`` method and ``pi`` module should accept batches of 
                observations as inputs, and ``q1`` and ``q2`` should accept a batch 
                of observations and a batch of actions as inputs. When called, 
                ``act``, ``q1``, and ``q2`` should return:

                ===========  ================  ======================================
                Call         Output Shape      Description
                ===========  ================  ======================================
                ``act``      (batch, act_dim)  | Numpy array of actions for each
                                               | observation.
                ``q1``       (batch,)          | Tensor containing one current estimate
                                               | of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ``q2``       (batch,)          | Tensor containing the other current
                                               | estimate of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ===========  ================  ======================================

                Calling ``pi`` should return:

                ===========  ================  ======================================
                Symbol       Shape             Description
                ===========  ================  ======================================
                ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                               | given observations.
                ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                               | actions in ``a``. Importantly: gradients
                                               | should be able to flow back into ``a``.
                ===========  ================  ======================================

            ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
                you provided to SAC.

            seed (int): Seed for random number generators.

            steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
                for the agent and the environment in each epoch.

            epochs (int): Number of epochs to run and train agent.

            replay_size (int): Maximum length of replay buffer.

            gamma (float): Discount factor. (Always between 0 and 1.)

            polyak (float): Interpolation factor in polyak averaging for target 
                networks. Target networks are updated towards main networks 
                according to:

                .. math:: \\theta_{\\text{targ}} \\leftarrow 
                    \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

                where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
                close to 1.)

            lr (float): Learning rate (used for both policy and value learning).

            alpha (float): Entropy regularization coefficient. (Equivalent to 
                inverse of reward scale in the original SAC paper.)

            batch_size (int): Minibatch size for SGD.

            start_steps (int): Number of steps for uniform-random action selection,
                before running real policy. Helps exploration.

            update_after (int): Number of env interactions to collect before
                starting to do gradient descent updates. Ensures replay buffer
                is full enough for useful updates.

            update_every (int): Number of env interactions that should elapse
                between gradient descent updates. Note: Regardless of how long 
                you wait between updates, the ratio of env steps to gradient steps 
                is locked to 1.

            num_test_episodes (int): Number of episodes to test the deterministic
                policy at the end of each epoch.

            max_ep_len (int): Maximum length of trajectory / episode / rollout.

            logger_kwargs (dict): Keyword args for EpochLogger.

            save_freq (int): How often (in terms of gap between epochs) to save
                the current policy and value function.

            """

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs)
        self.ac_targ = deepcopy(self.ac)
        self.gamma = gamma


        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False
            
        # List of parameters for both Q-networks (save this for convenience)
        self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters())

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size)

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)
        self.algo = algo

        self.lagrange_threshold = 10
        self.penalty_lr = lr
        self.tune_lambda = True if 'lagrange' in self.algo else False
        if self.tune_lambda:
            print("Tuning Lambda")
            self.target_action_gap = self.lagrange_threshold
            self.log_lamda = torch.zeros(1, requires_grad=True, device=device)
            self.lamda_optimizer = torch.optim.Adam([self.log_lamda],lr=self.penalty_lr)
            self.lamda = self.log_lamda.exp()
            self.min_q_weight = 1.0
        else:
            # Without Lagrange tuning the penalty scale is fixed; lamda is
            # still logged in compute_loss_q, so keep it in sync.
            self.lamda = min_q_weight
            self.min_q_weight = min_q_weight
        self.automatic_alpha_tuning = automatic_alpha_tuning
        if self.automatic_alpha_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
            self.alpha_optim = Adam([self.log_alpha], lr=p_lr)
            self.alpha = self.log_alpha.exp()
        else:
            self.alpha = alpha
        # self.alpha = alpha # CWR does not require entropy in Q evaluation
        self.target_update_freq = 1
        self.p_lr = p_lr
        self.lr = lr


        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.num_test_episodes = num_test_episodes
        self.max_ep_len = max_ep_len
        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.update_after = update_after
        self.update_every = update_every
        self.batch_size = batch_size
        self.save_freq = save_freq
        self.polyak = polyak
        self.softmax = torch.nn.Softmax(dim=1)
        self.softplus = torch.nn.Softplus(beta=1, threshold=20)
        self.policy_eval_start = policy_eval_start
        self._current_epoch = 0
        
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)
        print("Running Offline RL algorithm: {}".format(self.algo))


    def populate_replay_buffer(self):
        dataset = d4rl.qlearning_dataset(self.env)
        self.replay_buffer.obs_buf[:dataset['observations'].shape[0],:] = dataset['observations']
        self.replay_buffer.act_buf[:dataset['actions'].shape[0],:] = dataset['actions']
        self.replay_buffer.obs2_buf[:dataset['next_observations'].shape[0],:] = dataset['next_observations']
        self.replay_buffer.rew_buf[:dataset['rewards'].shape[0]] = dataset['rewards']
        self.replay_buffer.done_buf[:dataset['terminals'].shape[0]] = dataset['terminals']
        self.replay_buffer.size = dataset['observations'].shape[0]
        self.replay_buffer.ptr = (self.replay_buffer.size + 1) % self.replay_buffer.max_size

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = self.ac.q1(o,a)
        q2 = self.ac.q2(o,a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)

            # Target Q-values
            q1_pi_targ = self.ac_targ.q1(o2, a2)
            q2_pi_targ = self.ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2


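        # CQL penalty: push Q-values down on actions sampled from the policy
        # (plus next-state and uniform samples in the non-rho variant) and up
        # on dataset actions; min_q_weight / lamda scales the penalty.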
        self.logger.store(CQLalpha=self.lamda)
        if 'rho' in self.algo:
            samples = 10
            # Sample from previous policy (10 samples)
            o_rep = o.repeat_interleave(repeats=samples,dim=0)
            sample_actions, _ = self.ac.pi(o_rep)
            cql_loss_q1 = self.ac.q1(o_rep,sample_actions).reshape(-1,1)
            cql_loss_q2 = self.ac.q2(o_rep,sample_actions).reshape(-1,1)

            cql_loss_q1 = cql_loss_q1-np.log(samples)
            cql_loss_q2 = cql_loss_q2-np.log(samples)

            cql_loss_q1 = torch.logsumexp(cql_loss_q1,dim=1).mean()*self.min_q_weight
            cql_loss_q2 = torch.logsumexp(cql_loss_q2,dim=1).mean()*self.min_q_weight
            
            # Sample from dataset
            cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight
            cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight

        else:
            samples = 10 
            q1_pi_samples = None
            q2_pi_samples = None
            # Add samples from previous policy
            o_rep = o.repeat_interleave(repeats=samples,dim=0)
            o2_rep = o2.repeat_interleave(repeats=samples,dim=0)
            # o_rep = o.repeat_interleave(samples,1)

            # Samples from current policy
            sample_action, logpi = self.ac.pi(o_rep)
            q1_pi_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach()
            q2_pi_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach()
            q1_pi_samples = q1_pi_samples.view((o.shape[0],-1))
            q2_pi_samples = q2_pi_samples.view((o.shape[0],-1))

            sample_next_action, logpi_n = self.ac.pi(o2_rep)
            q1_next_pi_samples = self.ac.q1(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach()
            q2_next_pi_samples = self.ac.q2(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach()
            q1_next_pi_samples = q1_next_pi_samples.view((o2.shape[0],-1))
            q2_next_pi_samples = q2_next_pi_samples.view((o2.shape[0],-1))


            # Add samples from uniform sampling
            sample_action = np.random.uniform(low=self.env.action_space.low,high=self.env.action_space.high,size=(q1_pi_samples.shape[0]*10,self.env.action_space.high.shape[0]))
            sample_action = torch.FloatTensor(sample_action).to(device)
            log_pi = torch.FloatTensor([np.log(1/np.prod(self.env.action_space.high-self.env.action_space.low))]).to(device)


            q1_rand_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach()
            q2_rand_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach()


            q1_rand_samples = q1_rand_samples.view((o.shape[0],-1))
            q2_rand_samples = q2_rand_samples.view((o.shape[0],-1))
            
            cql_loss_q1 = torch.logsumexp(torch.cat([q1_pi_samples,q1_next_pi_samples,q1_rand_samples],dim=1),dim=1).mean()*self.min_q_weight 
            cql_loss_q2 = torch.logsumexp(torch.cat((q2_pi_samples,q2_next_pi_samples,q2_rand_samples),dim=1),dim=1).mean()*self.min_q_weight


            
            # Sample from dataset
            cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight
            cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight

        # Update the cql-alpha
        if 'lagrange' in self.algo:
            cql_alpha = torch.clamp(self.log_lamda.exp(), min=0.0, max=1000000.0)
            self.lamda = cql_alpha.item()
            cql_loss_q1 = cql_alpha*(cql_loss_q1-self.target_action_gap)
            cql_loss_q2 = cql_alpha*(cql_loss_q2-self.target_action_gap)
            self.lamda_optimizer.zero_grad()
            lamda_loss = (-cql_loss_q1-cql_loss_q2)*0.5
            lamda_loss.backward(retain_graph=True)
            self.lamda_optimizer.step()
            # print(self.log_lamda.exp())

        avg_q = 0.5*(cql_loss_q1.mean() + cql_loss_q2.mean()).detach().cpu()
        loss_q += (cql_loss_q1.mean() + cql_loss_q2.mean())


        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy(),
                      AvgQ = avg_q)

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self,data):
        o = data['obs']
        a = data['act']
        pi, logp_pi = self.ac.pi(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        loss_pi = (self.alpha * logp_pi - q_pi).mean()

        # TODO: Verify if this is needed
        if self._current_epoch<self.policy_eval_start:
            policy_log_prob = self.ac.pi.get_logprob(o, a)
            loss_pi = (self.alpha * logp_pi - policy_log_prob).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info, logp_pi



    def update(self, data, update_timestep):
        self._current_epoch += 1
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()


        # Record things
        self.logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort 
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi, pi_info, log_pi = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()


        if self.automatic_alpha_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in self.q_params:
            p.requires_grad = True

        # Record things
        self.logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        if update_timestep % self.target_update_freq == 0:
            with torch.no_grad():
                for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                    # NB: We use in-place operations "mul_" and "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(self.polyak)
                    p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, o, deterministic=False):
        return self.ac.act(torch.as_tensor(o, dtype=torch.float32).to(device), 
                      deterministic)

    def test_agent(self):
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = self.test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len)

    def run(self):
        # Prepare for interaction with environment
        total_steps = self.epochs * self.steps_per_epoch
        start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            # # Update handling
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update(data=batch, update_timestep=t)

            # End of epoch handling
            if (t+1) % self.steps_per_epoch == 0:
                epoch = (t+1) // self.steps_per_epoch

                # # Save model
                # if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                #     self.logger.save_state({'env': self.env}, None)

                # Test the performance of the deterministic version of the agent.
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LogPi', with_min_and_max=True)
                self.logger.log_tabular('LossPi', average_only=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('CQLalpha', average_only=True)
                self.logger.log_tabular('Time', time.time()-start_time)
                self.logger.dump_tabular()



    def train(self, training_epochs):
        # Main loop: collect experience in env and update/log each epoch
        for t in range(training_epochs):
            # # Update handling
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update(data=batch, update_timestep=t)

        self.test_agent()

    def collect_episodes(self, num_episodes):
        env_steps = 0
        for j in range(num_episodes):
            o, d, ep_ret, ep_len = self.env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Act with the current (stochastic) policy and store the transition.
                act = self.get_action(o)
                no, r, d, _ = self.env.step(act)
                self.replay_buffer.store(o, act, r, no, d)
                o = no
                ep_ret += r
                ep_len += 1
                env_steps += 1
        return env_steps


    def log_and_dump(self):
        # Log info about epoch
        self.logger.log_tabular('TestEpRet', with_min_and_max=True)
        self.logger.log_tabular('TestEpLen', average_only=True)
        self.logger.log_tabular('Q1Vals', with_min_and_max=True)
        self.logger.log_tabular('Q2Vals', with_min_and_max=True)
        self.logger.log_tabular('LogPi', with_min_and_max=True)
        self.logger.log_tabular('LossPi', average_only=True)
        self.logger.log_tabular('LossQ', average_only=True)
        self.logger.log_tabular('CQLalpha', average_only=True)
        self.logger.dump_tabular()
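
# Usage sketch for CQL (assumption: a d4rl-registered Gym environment, so
# populate_replay_buffer() can call d4rl.qlearning_dataset):
#     agent = CQL(lambda: gym.make('hopper-medium-v0'), algo='CQL')
#     agent.populate_replay_buffer()
#     agent.run()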
Example #8
class AWAC:

    def __init__(self, env_fn, actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=100,
                 epochs=10000,
                 replay_size=int(2000000),
                 gamma=0.99,
                 polyak=0.995,
                 lr=3e-4,
                 p_lr=3e-4,
                 alpha=0.0,
                 batch_size=1024,
                 start_steps=10000,
                 update_after=0,
                 update_every=50,
                 num_test_episodes=10,
                 max_ep_len=1000,
                 logger_kwargs=dict(),
                 save_freq=1,
                 algo='SAC'):
        """
        AWAC (Advantage-Weighted Actor-Critic). The argument documentation
        below follows the Spinning Up SAC template this class is adapted from.


        Args:
            env_fn : A function which creates a copy of the environment.
                The environment must satisfy the OpenAI Gym API.

            actor_critic: The constructor method for a PyTorch Module with an ``act`` 
                method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
                The ``act`` method and ``pi`` module should accept batches of 
                observations as inputs, and ``q1`` and ``q2`` should accept a batch 
                of observations and a batch of actions as inputs. When called, 
                ``act``, ``q1``, and ``q2`` should return:

                ===========  ================  ======================================
                Call         Output Shape      Description
                ===========  ================  ======================================
                ``act``      (batch, act_dim)  | Numpy array of actions for each
                                               | observation.
                ``q1``       (batch,)          | Tensor containing one current estimate
                                               | of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ``q2``       (batch,)          | Tensor containing the other current
                                               | estimate of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ===========  ================  ======================================

                Calling ``pi`` should return:

                ===========  ================  ======================================
                Symbol       Shape             Description
                ===========  ================  ======================================
                ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                               | given observations.
                ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                               | actions in ``a``. Importantly: gradients
                                               | should be able to flow back into ``a``.
                ===========  ================  ======================================

            ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
                you provided to SAC.

            seed (int): Seed for random number generators.

            steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
                for the agent and the environment in each epoch.

            epochs (int): Number of epochs to run and train agent.

            replay_size (int): Maximum length of replay buffer.

            gamma (float): Discount factor. (Always between 0 and 1.)

            polyak (float): Interpolation factor in polyak averaging for target 
                networks. Target networks are updated towards main networks 
                according to:

                .. math:: \\theta_{\\text{targ}} \\leftarrow 
                    \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

                where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
                close to 1.)

            lr (float): Learning rate (used for both policy and value learning).

            alpha (float): Entropy regularization coefficient. (Equivalent to 
                inverse of reward scale in the original SAC paper.)

            batch_size (int): Minibatch size for SGD.

            start_steps (int): Number of steps for uniform-random action selection,
                before running real policy. Helps exploration.

            update_after (int): Number of env interactions to collect before
                starting to do gradient descent updates. Ensures replay buffer
                is full enough for useful updates.

            update_every (int): Number of env interactions that should elapse
                between gradient descent updates. Note: Regardless of how long 
                you wait between updates, the ratio of env steps to gradient steps 
                is locked to 1.

            num_test_episodes (int): Number of episodes to test the deterministic
                policy at the end of each epoch.

            max_ep_len (int): Maximum length of trajectory / episode / rollout.

            logger_kwargs (dict): Keyword args for EpochLogger.

            save_freq (int): How often (in terms of gap between epochs) to save
                the current policy and value function.

            """

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space, self.env.action_space,
                               special_policy='awac', **ac_kwargs)
        self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space,
                                    special_policy='awac', **ac_kwargs)
        self.ac_targ.load_state_dict(self.ac.state_dict())
        self.gamma = gamma

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # List of parameters for both Q-networks (save this for convenience)
        self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters())

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim,
                                          size=replay_size)

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)
        self.algo = algo

        self.p_lr = p_lr
        self.lr = lr
        self.alpha = 0
        # # Algorithm specific hyperparams

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.num_test_episodes = num_test_episodes
        self.max_ep_len = max_ep_len
        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.update_after = update_after
        self.update_every = update_every
        self.batch_size = batch_size
        self.save_freq = save_freq
        self.polyak = polyak
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)
        print("Running Offline RL algorithm: {}".format(self.algo))

    def populate_replay_buffer(self, env_name):
        data_envs = {
            'HalfCheetah-v2': (
                "awac_data/hc_action_noise_15.npy",
                "awac_data/hc_off_policy_15_demos_100.npy"),
            'Ant-v2': (
                "awac_data/ant_action_noise_15.npy",
                "awac_data/ant_off_policy_15_demos_100.npy"),
            'Walker2d-v2': (
                "awac_data/walker_action_noise_15.npy",
                "awac_data/walker_off_policy_15_demos_100.npy"),
        }
        if env_name in data_envs:
            print('Loading saved data')
            for file in data_envs[env_name]:
                if not os.path.exists(file):
                    warnings.warn(colored('Offline data not found. Follow awac_data/instructions.txt to download. Running without offline data.', 'red'))
                    break
                data = np.load(file, allow_pickle=True)
                for demo in data:
                    for transition in list(zip(demo['observations'], demo['actions'], demo['rewards'],
                                               demo['next_observations'], demo['terminals'])):
                        self.replay_buffer.store(*transition)
        else:
            dataset = d4rl.qlearning_dataset(self.env)
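            # d4rl.qlearning_dataset returns a dict of aligned arrays keyed by
            # 'observations', 'actions', 'rewards', 'next_observations', 'terminals'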
            N = dataset['rewards'].shape[0]
            for i in range(N):
                self.replay_buffer.store(dataset['observations'][i], dataset['actions'][i],
                                         dataset['rewards'][i], dataset['next_observations'][i],
                                         float(dataset['terminals'][i]))
            print("Loaded dataset")

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)

            # Target Q-values
            q1_pi_targ = self.ac_targ.q1(o2, a2)
            q2_pi_targ = self.ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2)
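            # i.e. backup = r + gamma * (1 - d) * (min(Q1', Q2') - alpha * logp);
            # with self.alpha = 0 this is a standard clipped double-Q target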

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self, data):
        o = data['obs']

        pi, logp_pi = self.ac.pi(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        v_pi = torch.min(q1_pi, q2_pi)

        beta = 2
        q1_old_actions = self.ac.q1(o, data['act'])
        q2_old_actions = self.ac.q2(o, data['act'])
        q_old_actions = torch.min(q1_old_actions, q2_old_actions)

        adv_pi = q_old_actions - v_pi
        weights = F.softmax(adv_pi / beta, dim=0)
        policy_logpp = self.ac.pi.get_logprob(o, data['act'])
        loss_pi = (-policy_logpp * len(weights) * weights.detach()).mean()

        # Useful info for logging
        pi_info = dict(LogPi=policy_logpp.detach().numpy())

        return loss_pi, pi_info
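
    # Note on the weighting above: with beta = 2 and a batch of advantages such
    # as adv_pi = [2.0, 0.0, -2.0], F.softmax(adv_pi / beta, dim=0) gives
    # weights of roughly [0.67, 0.24, 0.09], so dataset actions with positive
    # advantage dominate the policy update; multiplying by len(weights) keeps
    # the loss on the scale of an ordinary mean.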

    def update(self, data, update_timestep):
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Record things
        self.logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi, pi_info = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        # Unfreeze Q-networks so they can be optimized at the next update step.
        for p in self.q_params:
            p.requires_grad = True

        # Record things
        self.logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, o, deterministic=False):
        return self.ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(self):
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = self.test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)  # Get unnormalized score

            # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len)  # Get normalized score

    def run(self):
        # Prepare for interaction with environment
        total_steps = self.epochs * self.steps_per_epoch
        start_time = time.time()
        obs, ep_ret, ep_len = self.env.reset(), 0, 0
        done = True
        num_train_episodes = 0

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            # At an episode boundary, log the exploration return and reset the env
            if done and t > 0:
                self.logger.store(ExplEpRet=ep_ret, ExplEpLen=ep_len)

                obs, ep_ret, ep_len = self.env.reset(), 0, 0
                num_train_episodes += 1

            # Collect experience
            act = self.get_action(obs, deterministic=False)
            next_obs, rew, done, info = self.env.step(act)

            self.replay_buffer.store(obs, act, rew, next_obs, done)
            obs = next_obs

            # Update handling
            if t > self.update_after and t % self.update_every == 0:
                for _ in range(self.update_every):
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    self.update(data=batch, update_timestep=t)

            # End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch

                # Save model
                if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                    self.logger.save_state({'env': self.env}, None)

                # Test the performance of the deterministic version of the agent.
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LogPi', with_min_and_max=True)
                self.logger.log_tabular('LossPi', average_only=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('Time', time.time() - start_time)
                self.logger.dump_tabular()


class Meta_control(object):
    def __init__(self, **kwargs):

        for key, value in kwargs.items():
            setattr(self, key, value)

        state_dim = self.env.observation_space.shape[0]
        action_dim = self.weight_dim  # self.env.action_space.shape[0]
        # print(state_dim, action_dim)
        # initialize value function
        self.value_Network = Value_Network(state_dim, 256, 1).to(self.device)
        self.value_net_optimizer = optim.RMSprop(
            self.value_Network.parameters(), lr=self.value_net_lr)
        # self.value_net_optimizer = optim.Adam(self.value_Network.parameters(), lr=self.value_net_lr)
        self.value_Network_target = Value_Network(state_dim, 256,
                                                  1).to(self.device)
        self.value_Network_target.load_state_dict(
            self.value_Network.state_dict())
        self.value_net_loss_func = nn.MSELoss()
        self.value_net_optimizer.zero_grad()
        # initialize Q function
        self.soft_Q_Network = Soft_Q_Network(state_dim + action_dim, 256,
                                             action_dim).to(self.device)
        self.soft_Q_net_optimizer = optim.RMSprop(
            self.soft_Q_Network.parameters(), lr=self.soft_Q_net_lr)
        # self.soft_Q_net_optimizer = optim.Adam(self.soft_Q_Network.parameters(), lr=self.soft_Q_net_lr)
        self.soft_Q_Network_target = Soft_Q_Network(state_dim + action_dim,
                                                    256,
                                                    action_dim).to(self.device)
        self.soft_Q_Network_target.load_state_dict(
            self.soft_Q_Network.state_dict())
        self.soft_Q_net_loss_func = nn.MSELoss()
        self.soft_Q_net_optimizer.zero_grad()
        # initialize policy network
        self.policy_Network = Policy_Network(state_dim, 256,
                                             action_dim).to(self.device)
        self.policy_net_optimizer = optim.RMSprop(
            self.policy_Network.parameters(), lr=self.policy_net_lr)
        # self.policy_net_optimizer = optim.Adam(self.policy_Network.parameters(), lr=self.policy_net_lr)
        self.policy_net_optimizer.zero_grad()
        # initialize replay buffer
        self.replay_Buffer = ReplayBuffer(self.replay_buffer_size)
        # synchronize the parameters of networks in all threads
        # sync_all_params(self.value_Network.parameters())
        # sync_all_params(self.soft_Q_Network.parameters())
        # sync_all_params(self.policy_Network.parameters())
        # sync_all_params(self.value_Network_target.parameters())
        # sync_all_params(self.soft_Q_Network_target.parameters())

    def act(self, state):
        # print(state)
        state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(
            self.device)
        mean, log_std = self.policy_Network(state_tensor)
        normal_distribution = Normal(mean, log_std.exp())
        action_sample = normal_distribution.sample()
        # action_normalized = torch.softmax(action_sample, dim=0)
        action = torch.tanh(action_sample).squeeze(0).detach().cpu().numpy()
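        # tanh squashes the Gaussian sample into (-1, 1); because act() is only
        # used for environment interaction, the non-reparameterized .sample()
        # above is fine (no gradient needs to flow through action selection)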
        return action

    def value_Network_backward(self, state, log_prob, soft_Q_value):
        value_predict = self.value_Network(state)
        value_label = soft_Q_value - log_prob  # V target: Q - log_prob (the self.temperature factor is omitted here)
        value_loss = self.value_net_loss_func(value_predict,
                                              value_label.detach())
        value_loss.backward()
        # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1:
        #     average_gradients(self.value_net_optimizer.param_groups) # average the gradients of all threads

    def soft_Q_Network_backward(self, state, action, reward, nxt_state):
        soft_Q_predict = self.soft_Q_Network(state, action)
        soft_Q_label = reward + self.discount * self.value_Network_target(
            nxt_state)
        soft_Q_loss = self.soft_Q_net_loss_func(soft_Q_predict, soft_Q_label)
        # print(soft_Q_loss)
        soft_Q_loss.backward()
        # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1:
        #     average_gradients(self.soft_Q_net_optimizer.param_groups) # average the gradients of all threads

    def policy_Network_backward(self, log_prob, soft_Q_value):
        policy_loss = -torch.mean(soft_Q_value - self.temperature *
                                  log_prob)  # -E[Q - temperature * log_prob] = -E[V]
        policy_loss.backward()
        # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1:
        #     average_gradients(self.policy_net_optimizer.param_groups) # average the gradients of all threads
        # print(self.learn_times, 'ave gradients')

    def target_network_update(self, target_net, eval_net):
        for target_params, eval_params in zip(target_net.parameters(),
                                              eval_net.parameters()):
            target_params.data.copy_(target_params.data *
                                     (1.0 - self.target_update) +
                                     eval_params * self.target_update)
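        # i.e. theta_target <- (1 - tau) * theta_target + tau * theta,
        # with tau = self.target_update (a small constant close to 0)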

    def backward(self):
        if self.replay_Buffer.__len__() < self.batch_size:
            return

        state, action, reward, nxt_state = self.replay_Buffer.sample(
            self.batch_size)
        state_tensor = torch.tensor(state, dtype=torch.float).to(self.device)
        action_tensor = torch.tensor(action, dtype=torch.float).to(self.device)
        reward_tensor = torch.tensor(
            reward, dtype=torch.float).unsqueeze(1).to(self.device)
        nxt_state_tensor = torch.tensor(nxt_state,
                                        dtype=torch.float).to(self.device)

        action_sample, log_prob, _ = self.policy_Network.evaluate(state_tensor)
        # soft_Q_value = self.soft_Q_Network(state_tensor, action_sample)
        soft_Q_value = self.soft_Q_Network_target(state_tensor, action_sample)

        self.value_Network_backward(state_tensor, log_prob, soft_Q_value)
        self.soft_Q_Network_backward(state_tensor, action_tensor,
                                     reward_tensor, nxt_state_tensor)
        self.policy_Network_backward(log_prob, soft_Q_value)

        self.target_network_update(self.value_Network_target,
                                   self.value_Network)
        self.target_network_update(self.soft_Q_Network_target,
                                   self.soft_Q_Network)

    def step(self):
        self.value_net_optimizer.step()
        self.soft_Q_net_optimizer.step()
        self.policy_net_optimizer.step()
        self.value_net_optimizer.zero_grad()
        self.soft_Q_net_optimizer.zero_grad()
        self.policy_net_optimizer.zero_grad()

    def reserve_network(self, folder, episode):
        torch.save(self.value_Network.state_dict(),
                   folder + 'E' + str(episode) + '_sac_value_Network.pkl')  #
        torch.save(self.soft_Q_Network.state_dict(),
                   folder + 'E' + str(episode) + '_soft_Q_Network.pkl')  #
        torch.save(self.policy_Network.state_dict(),
                   folder + 'E' + str(episode) + '_policy_Network.pkl')  #
        # np.savetxt(folder + 'reward.txt', self.reward_buf)

    def load_network(self, folder, episode):
        self.value_Network.load_state_dict(
            torch.load(folder + 'E' + str(episode) + '_sac_value_Network.pkl'))
        self.soft_Q_Network.load_state_dict(
            torch.load(folder + 'E' + str(episode) + '_soft_Q_Network.pkl'))
        self.policy_Network.load_state_dict(
            torch.load(folder + 'E' + str(episode) + '_policy_Network.pkl'))
        self.value_Network_target.load_state_dict(
            self.value_Network.state_dict())
        self.soft_Q_Network_target.load_state_dict(
            self.soft_Q_Network.state_dict())
        # for target_params, eval_params in zip(self.soft_Q_Network_target.parameters(), self.soft_Q_Network.parameters()):
        #     target_params.data.copy_(eval_params)
        # for target_params, eval_params in zip(self.value_Network_target.parameters(), self.value_Network.parameters()):
        #     target_params.data.copy_(eval_params)
        # self.reward_buf = list(np.loadtxt(folder + 'reward.txt'))

    # def sync_multi_thread(self):
    #     # synchronize the parameters of networks in all threads
    #     sync_all_params(self.value_Network.parameters())
    #     sync_all_params(self.soft_Q_Network.parameters())
    #     sync_all_params(self.policy_Network.parameters())
    #     sync_all_params(self.value_Network_target.parameters())
    #     sync_all_params(self.soft_Q_Network_target.parameters())

    def logger_setup(self, logger_kwargs, **kwargs):
        self.logger = EpochLogger(**logger_kwargs)
        for key, value in kwargs.items():
            if key != 'env' and key != 'output_dir':
                self.logger.log_tabular(key, value)

    def logger_update(self, kwargs):
        for key, value in kwargs.items():
            self.logger.log_tabular(key, value)
        self.logger.dump_tabular()
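
The backward()/step() split above accumulates gradients for all three networks
and applies them in one place. A minimal usage sketch of the intended call
pattern; the constructor kwargs and the replay_Buffer.push signature are
assumptions inferred from the attributes the class references, not a
documented interface:

agent = Meta_control(env=env, device='cpu', weight_dim=4,
                     value_net_lr=3e-4, soft_Q_net_lr=3e-4, policy_net_lr=3e-4,
                     temperature=0.2, discount=0.99, target_update=0.005,
                     batch_size=64, replay_buffer_size=100000)
state = agent.env.reset()
for t in range(10000):
    action = agent.act(state)
    next_state, reward, done, _ = agent.env.step(action)
    agent.replay_Buffer.push(state, action, reward, next_state)  # push() is assumed
    agent.backward()  # accumulates gradients (no-op until batch_size transitions are stored)
    agent.step()      # applies, then zeroes, the accumulated gradients
    state = agent.env.reset() if done else next_state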
Example #10
0
class Dqn:
    def __init__(self, env_name, train_step=200, evaluation_step=1000, max_ep_len=200, epsilon_train=0.1,
                epsilon_eval=0.01, batch_size=32, replay_size=1e6,
                epsilon_decay_period=100, warmup_steps=0, iteration=200, gamma=0.99,
                target_update_period=50, update_period=10, logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = gym.make(env_name)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0

        if debug_mode:
            self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs"))

        self.sess = tf.Session()
        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        tf.summary.histogram("q", self.q)
        tf.summary.histogram("q_target", self.q_target)
        tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()



    def build_model(self):
        self.input_shape = self.env.observation_space.shape
        self.model, self.model_target = mlp_dqn(self.env.action_space.n, self.input_shape)
        self.model_target.set_weights(self.model.get_weights())

    def choose_action(self, s, eval_mode=False):
        epsilon = self.epsilon_eval if eval_mode \
            else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step, self.warmup_steps, self.epsilon_train)
        # print("epsilon:", epsilon)
        if random.random() <= 1 - epsilon:
            q = self.model.predict(s[np.newaxis, :])
            a = np.argmax(q, axis=1)[0]
            # print()
        else:
            a = self.env.action_space.sample()

        return a

    def run_one_phrase(self, min_step, eval_mode=False):
        step = 0
        episode = 0
        reward = 0.

        while step < min_step:
            reward_episode = 0.
            step_episode = 0
            done = False
            obs = self.env.reset()
            # o = np.array(obs)

            while not done:
                a = self.choose_action(np.array(obs), eval_mode)
                obs_, r, done, _ = self.env.step(a)

                step += 1
                step_episode += 1
                reward += r
                reward_episode += r

                if not eval_mode:
                    self.cur_train_step += 1
                    self.replay_buffer.add(np.array(obs), a, np.array(obs_), r, done)

                    if self.cur_train_step > 100:
                        if self.cur_train_step % self.update_period == 0:
                            # data = self.replay_buffer.sample()
                            (s, a, s_, r, d) = self.replay_buffer.sample()
                            target_q = self.model_target.predict(s_)
                            q_ = np.max(target_q, axis=1)

                            q_target = r + (1 - d) * self.gamma * q_

                            q = self.model.predict(s)
                            ori_q = np.copy(q)
                            batch_index = np.arange(self.batch_size)
                            q[batch_index, a] = q_target
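                            # Only the Q-value of the action actually taken is
                            # regressed toward the target; the other action
                            # slots keep the network's own predictions, so
                            # their error (and gradient) is zero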

                            result = self.model.train_on_batch(np.array(s), q)

                            if debug_mode:
                                merge = self.sess.run(self.merge,
                                                      feed_dict={self.loss: result[0], self.q: ori_q, self.q_target: q,
                                                                 self.target_q: target_q})
                                self.summary.add_summary(merge, (self.cur_train_step-100)/self.update_period)
                            # print("result:", result)

                        if self.cur_train_step % self.target_update_period == 0:
                            self.model_target.set_weights(self.model.get_weights())

                if step_episode >= self.max_ep_len:
                    break
                obs = obs_

            episode += 1

            # print("ep:", episode, "step:", step, "r:", reward)
            if not eval_mode:
                self.logger.store(step=step_episode, reward=reward_episode)

        return reward, episode

    def train_test(self):
        for i in range(self.iteration):
            print("iter:", i+1)
            self.logger.store(iter=i+1)
            reward, episode = self.run_one_phrase(self.train_step)
            print("reward:", reward/episode, "episode:", episode)

            self.logger.log_tabular("iter", i+1)
            self.logger.log_tabular("reward", with_min_and_max=True)
            self.logger.log_tabular("step", with_min_and_max=True)
            self.logger.dump_tabular()

            reward, episode = self.run_one_phrase(self.evaluation_step, True)
            print("reward:", reward / episode, "episode:", episode)
Example #11
0
class Dqn:
    def __init__(self, env_name, train_step=250000/4, evaluation_step=125000/4, max_ep_len=27000/4, epsilon_train=0.1,
                epsilon_eval=0.01, batch_size=32, replay_size=1e6,
                epsilon_decay_period=250000/4, warmup_steps=20000/4, iteration=200, gamma=0.99,
                target_update_period=8000/4, update_period=4, logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        # self.env = make_atari(env_name)
        # self.env = wrap_deepmind(self.env, frame_stack=True)
        self.env = gym.make(env_name)
        env = self.env.env
        self.env = AtariPreprocessing(env)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0

        self.observation_shape = (84, 84)
        self.state_shape = (1,) + self.observation_shape + (4,)
        self.s = np.zeros(self.state_shape)
        self.last_s = np.zeros(self.state_shape)

        if debug_mode:
            self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs"))

        self.sess = tf.Session()
        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        # tf.summary.histogram("q", self.q)
        # tf.summary.histogram("q_target", self.q_target)
        # tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()

    def build_model(self):
        self.model, self.model_target = nature_dqn(self.env.action_space.n)
        self.model_target.set_weights(self.model.get_weights())

    def choose_action(self, s, eval_mode=False):
        epsilon = self.epsilon_eval if eval_mode \
            else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step, self.warmup_steps, self.epsilon_train)

        if random.random() <= 1 - epsilon:
            q = self.model.predict(s[np.newaxis, :])
            a = np.argmax(q, axis=1)[0]
            # print()
        else:
            a = self.env.action_space.sample()

        return a

    def record_obs(self, observation):
        self.last_s = copy.copy(self.s)
        self.s = np.roll(self.s, -1, axis=-1)
        self.s[0, ..., -1] = np.squeeze(observation)
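        # Shift the 4-frame stack one slot along the channel axis and write
        # the newest (84, 84) observation into the last slot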

    def store(self, s, a, s_, r, done):
        pass

    def run_one_phrase(self, min_step, eval_mode=False):
        step = 0
        episode = 0
        reward = 0.

        while step < min_step:
            done = False
            obs = self.env.reset()
            # o = np.array(obs)

            step_episode = 0
            reward_episode = 0
            while not done:
                a = self.choose_action(np.array(obs), eval_mode)
                obs_, r, done, _ = self.env.step(a)

                step += 1
                step_episode += 1
                reward += r
                reward_episode += r
                
                if not eval_mode:
                    self.cur_train_step += 1
                    self.replay_buffer.add(np.array(obs), a, np.array(obs_), r, done)

                    if self.cur_train_step > 20000/4:
                        if self.cur_train_step % self.update_period == 0:
                            # data = self.replay_buffer.sample()
                            (s, a, s_, r, d) = self.replay_buffer.sample()
                            q_ = np.max(self.model_target.predict(s_), axis=1)
                            q_target = r + (1 - d) * self.gamma * q_
                            q = self.model.predict(s)
                            batch_index = np.arange(self.batch_size)
                            q[batch_index, a] = q_target
                            result = self.model.train_on_batch(np.array(s), q)
                            # print("result:", result)

                            if debug_mode:
                                merge = self.sess.run(self.merge, feed_dict={self.loss: result[0]})
                                self.summary.add_summary(merge, (self.cur_train_step - 20000/4) / self.update_period)

                        if self.cur_train_step % self.target_update_period == 0:
                            self.model_target.set_weights(self.model.get_weights())

                if step_episode >= self.max_ep_len:
                    break
                obs = obs_

            episode += 1

            # print("ep:", episode, "step:", step, "r:", reward)
            self.logger.store(step=step_episode, reward=reward_episode)

        return reward, episode

    def train_test(self):
        for i in range(self.iteration):
            print("iter:", i+1)
            self.logger.store(iter=i+1)
            reward, episode = self.run_one_phrase(self.train_step)
            print("reward:", reward/episode, "episode:", episode)

            self.logger.log_tabular("reward", with_min_and_max=True)
            self.logger.log_tabular("step", with_min_and_max=True)
            self.logger.dump_tabular()

            reward, episode = self.run_one_phrase(self.evaluation_step, True)
            print("reward:", reward / episode, "episode:", episode)
Example #12
0
def valor(args):
    if not hasattr(args, "get"):
        args.get = args.__dict__.get
    env_fn = args.get('env_fn', lambda: gym.make('HalfCheetah-v2'))
    actor_critic = args.get('actor_critic', ActorCritic)
    ac_kwargs = args.get('ac_kwargs', {})
    disc = args.get('disc', Discriminator)
    dc_kwargs = args.get('dc_kwargs', {})
    seed = args.get('seed', 0)
    episodes_per_epoch = args.get('episodes_per_epoch', 40)
    epochs = args.get('epochs', 50)
    gamma = args.get('gamma', 0.99)
    pi_lr = args.get('pi_lr', 3e-4)
    vf_lr = args.get('vf_lr', 1e-3)
    dc_lr = args.get('dc_lr', 2e-3)
    train_v_iters = args.get('train_v_iters', 80)
    train_dc_iters = args.get('train_dc_iters', 50)
    train_dc_interv = args.get('train_dc_interv', 2)
    lam = args.get('lam', 0.97)
    max_ep_len = args.get('max_ep_len', 1000)
    logger_kwargs = args.get('logger_kwargs', {})
    context_dim = args.get('context_dim', 4)
    max_context_dim = args.get('max_context_dim', 64)
    save_freq = args.get('save_freq', 10)
    k = args.get('k', 1)

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    ac_kwargs['action_space'] = env.action_space

    # Model
    actor_critic = actor_critic(input_dim=obs_dim[0] + max_context_dim,
                                **ac_kwargs)
    disc = disc(input_dim=obs_dim[0], context_dim=max_context_dim, **dc_kwargs)

    # Buffer
    local_episodes_per_epoch = episodes_per_epoch  # int(episodes_per_epoch / num_procs())
    buffer = Buffer(max_context_dim, obs_dim[0], act_dim[0],
                    local_episodes_per_epoch, max_ep_len, train_dc_interv)

    # Count variables
    var_counts = tuple(
        count_vars(module)
        for module in [actor_critic.policy, actor_critic.value_f, disc.policy])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n' %
               var_counts)

    # Optimizers
    # Optimizer for the RL policy
    train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr)

    # Optimizer for the value function (actor-critic baseline)
    train_v = torch.optim.Adam(actor_critic.value_f.parameters(), lr=vf_lr)

    # Optimizer for the decoder (discriminator)
    train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr)

    #pdb.set_trace()

    # Parameters Sync
    #sync_all_params(actor_critic.parameters())
    #sync_all_params(disc.parameters())
    '''
    Training function
    '''
    def update(e):
        obs, act, adv, pos, ret, logp_old = [
            torch.Tensor(x) for x in buffer.retrieve_all()
        ]

        # Policy
        #pdb.set_trace()
        _, logp, _ = actor_critic.policy(obs, act, batch=False)
        #pdb.set_trace()
        entropy = (-logp).mean()

        # Policy loss
        pi_loss = -(logp * (k * adv + pos)).mean()

        # Train policy (Go through policy update)
        train_pi.zero_grad()
        pi_loss.backward()
        # average_gradients(train_pi.param_groups)
        train_pi.step()

        # Value function
        v = actor_critic.value_f(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = actor_critic.value_f(obs)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            # average_gradients(train_v.param_groups)
            train_v.step()

        # Discriminator
        if (e + 1) % train_dc_interv == 0:
            print('Discriminator Update!')
            con, s_diff = [torch.Tensor(x) for x in buffer.retrieve_dc_buff()]
            _, logp_dc, _ = disc(s_diff, con)
            d_l_old = -logp_dc.mean()

            # Discriminator train
            for _ in range(train_dc_iters):
                _, logp_dc, _ = disc(s_diff, con)
                d_loss = -logp_dc.mean()
                train_dc.zero_grad()
                d_loss.backward()
                # average_gradients(train_dc.param_groups)
                train_dc.step()

            _, logp_dc, _ = disc(s_diff, con)
            dc_l_new = -logp_dc.mean()
        else:
            d_l_old = 0
            dc_l_new = 0

        # Log the changes
        _, logp, _, v = actor_critic(obs, act)
        pi_l_new = -(logp * (k * adv + pos)).mean()
        v_l_new = F.mse_loss(v, ret)
        kl = (logp_old - logp).mean()
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=entropy,
                     DeltaLossPi=(pi_l_new - pi_loss),
                     DeltaLossV=(v_l_new - v_l_old),
                     LossDC=d_l_old,
                     DeltaLossDC=(dc_l_new - d_l_old))
        # logger.store(Adv=adv.reshape(-1).numpy().tolist(), Pos=pos.reshape(-1).numpy().tolist())

    start_time = time.time()
    #Resets observations, rewards, done boolean
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    #Creates a uniform context distribution over the first context_dim contexts (the first place the curriculum will modify)
    context_dim_prob_dict = {
        i: 1 / context_dim if i < context_dim else 0
        for i in range(max_context_dim)
    }
    last_phi_dict = {i: 0 for i in range(context_dim)}
    context_dist = Categorical(
        probs=torch.Tensor(list(context_dim_prob_dict.values())))
    total_t = 0

    # History of decoder accuracies for the stagnation check inside the epoch
    # loop; initialized once here so the history accumulates across epochs
    decoder_accs = []
    stag_num = 10
    stag_pct = 0.05

    for epoch in range(epochs):
        #Sets actor critic and decoder (discriminator) into eval mode
        actor_critic.eval()
        disc.eval()

        #Runs the policy for local_episodes_per_epoch episodes before updating the policy
        for index in range(local_episodes_per_epoch):
            # Sample from context distribution and one-hot encode it (Step 2)
            # Every time we run the policy we sample a new context

            c = context_dist.sample()
            c_onehot = F.one_hot(c, max_context_dim).squeeze().float()
            for _ in range(max_ep_len):
                concat_obs = torch.cat(
                    [torch.Tensor(o.reshape(1, -1)),
                     c_onehot.reshape(1, -1)], 1)
                '''
                Feeds the observation concatenated with the one-hot context into
                actor_critic, which returns (the second output is unused here):
                a       - the action sampled from the policy
                logp_t  - the log probability of the sampled action a
                v_t     - the value estimate for this state
                '''
                a, _, logp_t, v_t = actor_critic(concat_obs)

                #Stores context and all other info about the state in the buffer
                buffer.store(c,
                             concat_obs.squeeze().detach().numpy(),
                             a.detach().numpy(), r, v_t.item(),
                             logp_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    # Score the finished trajectory with the discriminator
                    dc_diff = torch.Tensor(buffer.calc_diff()).unsqueeze(0)
                    # Context label for this episode
                    con = torch.Tensor([float(c)]).unsqueeze(0)
                    # Feed the per-step state differences and the context to the
                    # discriminator; log_p is the log probability it assigns to the true context
                    _, _, log_p = disc(dc_diff, con)
                    buffer.end_episode(log_p.detach().numpy())
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, [actor_critic, disc], None)

        # Sets actor_critic and discriminator into training mode
        actor_critic.train()
        disc.train()

        update(epoch)
        #Need to implement curriculum learning here to update context distribution
        '''
            Pseudocode:
            Loop through the episodes taken in local_episodes_per_epoch and check
            the log probability from the discriminator.
            If the decoder accuracy >= 0.86, increase k as follows:
            k = min(int(1.5*k + 1), Kmax), with Kmax = 64.
        '''

        if (epoch + 1) % train_dc_interv == 0 and epoch > 0:
            #pdb.set_trace()
            con, s_diff = [torch.Tensor(x) for x in buffer.retrieve_dc_buff()]
            print("Context: ", con)
            print("num_contexts", len(con))
            _, logp_dc, _ = disc(s_diff, con)
            log_p_context_sample = logp_dc.mean().detach().numpy()

            print("Log Probability context sample", log_p_context_sample)

            decoder_accuracy = np.exp(log_p_context_sample)
            print("Decoder Accuracy", decoder_accuracy)

            logger.store(LogProbabilityContext=log_p_context_sample,
                         DecoderAccuracy=decoder_accuracy)
            '''
            Curriculum re-weighting:
            compute a score phi(i) = -mean log p(context = i) for each context,
            rank contexts by phi (highest, i.e. hardest, first), set
            p(i) proportional to 1/rank(i), and normalize over the active contexts.
            '''
            logp_np = logp_dc.detach().numpy()
            con_np = con.detach().numpy()
            phi_dict = {i: 0 for i in range(context_dim)}
            count_dict = {i: 0 for i in range(context_dim)}
            for i in range(len(logp_np)):
                current_con = con_np[i]
                phi_dict[current_con] += logp_np[i]
                count_dict[current_con] += 1
            print(phi_dict)

            phi_dict = {
                k: last_phi_dict[k] if count_dict[k] == 0 else
                (-1) * v / count_dict[k]
                for (k, v) in phi_dict.items()
            }
            sorted_dict = dict(
                sorted(phi_dict.items(),
                       key=lambda item: item[1],
                       reverse=True))
            sorted_dict_keys = list(sorted_dict.keys())
            rank_dict = {
                sorted_dict_keys[i]: 1 / (i + 1)
                for i in range(len(sorted_dict_keys))
            }
            rank_dict_sum = sum(list(rank_dict.values()))
            context_dim_prob_dict = {
                k: rank_dict[k] / rank_dict_sum if k < context_dim else 0
                for k in context_dim_prob_dict.keys()
            }
            print(context_dim_prob_dict)

            decoder_accs.append(decoder_accuracy)
            stagnated = (len(decoder_accs) > stag_num
                         and (decoder_accs[-stag_num - 1] - decoder_accuracy) /
                         stag_num < stag_pct)
            if stagnated:
                new_context_dim = max(int(0.75 * context_dim), 5)
            elif decoder_accuracy >= 0.86:
                new_context_dim = min(int(1.5 * context_dim + 1),
                                      max_context_dim)
            if stagnated or decoder_accuracy >= 0.86:
                print("new_context_dim: ", new_context_dim)
                new_context_prob_arr = np.array(
                    new_context_dim * [1 / new_context_dim] +
                    (max_context_dim - new_context_dim) * [0])
                context_dist = Categorical(
                    probs=ptu.from_numpy(new_context_prob_arr))
                context_dim = new_context_dim

            for i in range(context_dim):
                if i in phi_dict:
                    last_phi_dict[i] = phi_dict[i]
                elif i not in last_phi_dict:
                    last_phi_dict[i] = max(phi_dict.values())

            buffer.clear_dc_buff()
        else:
            logger.store(LogProbabilityContext=0, DecoderAccuracy=0)

        # Log
        logger.store(ContextDim=context_dim)
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('LossDC', average_only=True)
        logger.log_tabular('DeltaLossDC', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('LogProbabilityContext', average_only=True)
        logger.log_tabular('DecoderAccuracy', average_only=True)
        logger.log_tabular('ContextDim', average_only=True)
        logger.dump_tabular()
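
The rank-based curriculum re-weighting above is easier to see on toy numbers.
A self-contained sketch of just that step (the phi values are made up for
illustration; in the code, phi is minus the mean decoder log probability, so
higher phi means a harder-to-decode context):

phis = {0: 1.2, 1: 0.4, 2: 2.5}
ordered = sorted(phis, key=phis.get, reverse=True)   # contexts by descending phi
rank = {c: i + 1 for i, c in enumerate(ordered)}     # hardest context gets rank 1
inv_rank = {c: 1.0 / rank[c] for c in phis}          # p(i) proportional to 1 / rank(i)
total = sum(inv_rank.values())
probs = {c: inv_rank[c] / total for c in phis}
print(probs)  # {0: ~0.27, 1: ~0.18, 2: ~0.55}: the hardest context is sampled most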
Example #13
0
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Vanilla Policy Gradient 

    (with GAE-Lambda for advantage estimation)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k:v for k,v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, 
                     KL=kl, Entropy=ent, 
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

            o2, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t==local_steps_per_epoch-1):
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
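
The advantages used by the VPG update come from GAE-Lambda inside VPGBuffer,
which is not shown here. A minimal numpy sketch of that estimator under the
usual definition (delta_t = r_t + gamma * V_{t+1} - V_t, discounted with
gamma * lam); the buffer in the source may differ in details such as
advantage normalization:

import numpy as np

def gae_advantages(rewards, values, last_val, gamma=0.99, lam=0.97):
    # `values` has one entry per step; `last_val` bootstraps the final state.
    vals = np.append(values, last_val)
    deltas = rewards + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv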
Example #14
0
def ddpg(env_fn,
         env_name,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         act_noise=0.1,
         num_test_episodes=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at the next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            episode_rewards.append(ep_ret)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    rewards_log = []
    episode_rewards = deque(maxlen=10)

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            rewards_log.append(np.mean(episode_rewards))

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    rewards_log = np.array(rewards_log)
    save_path = '../../log/ddpg/' + env_name + '/' + str(seed) + '.npy'
    np.save(save_path, rewards_log)
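The in-place polyak update above can be checked in isolation. A minimal sketch, with toy nn.Linear modules standing in for the actor-critic (the module sizes are assumptions, not taken from the code above):

import torch
import torch.nn as nn
from copy import deepcopy

net = nn.Linear(4, 2)      # stand-in for the main actor-critic
targ = deepcopy(net)       # target network starts as an exact copy
polyak = 0.995

with torch.no_grad():
    for p, p_targ in zip(net.parameters(), targ.parameters()):
        # Same update as in the loop above: targ <- polyak*targ + (1-polyak)*net,
        # done in place so no new tensors are allocated.
        p_targ.data.mul_(polyak)
        p_targ.data.add_((1 - polyak) * p.data)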
Example #15
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=10000,
                 evaluation_step=3000,
                 max_ep_len=300,
                 polyak=0.995,
                 start_steps=200,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 policy_delay=2,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=True,
                            port=port,
                            gpu=gpu,
                            discrete_control=False)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.cur_tensorboard_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.polyak = polyak
        self.gamma = gamma
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
        self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

        if debug_mode:
            self.summary = tf.summary.create_file_writer(
                os.path.join(self.logger.output_dir, "logs"))

        self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
        self.target_actor_critic = core.ActorCritic(self.act_dim,
                                                    self.act_limit)
        self.replay_buffer = ReplayBuffer(replay_size)

        self.loadpath = os.path.join(
            DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first")
        actor = core.ActorCnn()
        load_check = tf.train.Checkpoint(model=actor)
        load_check.restore(os.path.join(self.loadpath, "model.ckpt-200"))

        # with tf.GradientTape() as tape:
        #     x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
        #     x = tf.expand_dims(x, axis=0)
        #     a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim])
        #     a = tf.expand_dims(a, axis=0)
        #     self.actor_critic([x,a])
        #     self.actor_critic.choose_action(x)
        #     self.target_actor_critic([x,a])
        #     self.target_actor_critic.choose_action(x)
        with tf.GradientTape() as tape:
            img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
            img = tf.expand_dims(img, axis=0)
            speed = tf.random.uniform(minval=0, maxval=1, shape=(1, ))
            speed = tf.expand_dims(speed, axis=0)
            self.actor_critic.actor([img, speed])
            self.target_actor_critic.actor([img, speed])
            actor([img, speed])
        for old_var, var in zip(actor.variables, self.actor_critic.variables):
            var.assign(old_var)
        var = self.actor_critic.actor.trainable_variables
        old_var = actor.trainable_variables

        self.target_init(self.target_actor_critic, self.actor_critic)

        self.savepath = os.path.join(self.logger.output_dir, "saver")
        checkpoint = tf.train.Checkpoint(model=self.actor_critic,
                                         target_model=self.target_actor_critic)
        self.manager = tf.train.CheckpointManager(checkpoint,
                                                  directory=self.savepath,
                                                  max_to_keep=20,
                                                  checkpoint_name="model.ckpt")
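The forward passes on random inputs above exist only to build the Keras variables, so that the weight copy from the pretrained actor can succeed. A minimal sketch of the same trick, with toy Sequential models (the layer sizes are assumptions):

import tensorflow as tf

def make_net():
    return tf.keras.Sequential([tf.keras.layers.Dense(8, activation='relu'),
                                tf.keras.layers.Dense(2)])

net, target = make_net(), make_net()
dummy = tf.zeros([1, 4])
net(dummy)       # a forward pass creates the variables;
target(dummy)    # before it, net.variables is an empty list
for t, s in zip(target.variables, net.variables):
    t.assign(s)  # now the copy succeeds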
Example #16
class Td3:
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 train_step=10000,
                 evaluation_step=3000,
                 max_ep_len=300,
                 polyak=0.995,
                 start_steps=200,
                 batch_size=100,
                 replay_size=50000,
                 iteration=200,
                 gamma=0.99,
                 act_noise=0.1,
                 target_noise=0.2,
                 noise_clip=0.5,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 policy_delay=2,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.iteration = iteration
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=True,
                            port=port,
                            gpu=gpu,
                            discrete_control=False)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]
        self.start_steps = start_steps
        self.cur_train_step = 0
        self.cur_tensorboard_step = 0
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.act_limit = self.env.action_space.high[0]
        self.act_noise = act_noise
        self.target_noise = target_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.polyak = polyak
        self.gamma = gamma
        self.opti_q = tf.keras.optimizers.Adam(q_lr)
        self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

        if debug_mode:
            self.summary = tf.summary.create_file_writer(
                os.path.join(self.logger.output_dir, "logs"))

        self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
        self.target_actor_critic = core.ActorCritic(self.act_dim,
                                                    self.act_limit)
        self.replay_buffer = ReplayBuffer(replay_size)

        self.loadpath = os.path.join(
            DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first")
        actor = core.ActorCnn()
        load_check = tf.train.Checkpoint(model=actor)
        load_check.restore(os.path.join(self.loadpath, "model.ckpt-200"))

        # with tf.GradientTape() as tape:
        #     x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
        #     x = tf.expand_dims(x, axis=0)
        #     a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim])
        #     a = tf.expand_dims(a, axis=0)
        #     self.actor_critic([x,a])
        #     self.actor_critic.choose_action(x)
        #     self.target_actor_critic([x,a])
        #     self.target_actor_critic.choose_action(x)
        with tf.GradientTape() as tape:
            img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim)
            img = tf.expand_dims(img, axis=0)
            speed = tf.random.uniform(minval=0, maxval=1, shape=(1, ))
            speed = tf.expand_dims(speed, axis=0)
            self.actor_critic.actor([img, speed])
            self.target_actor_critic.actor([img, speed])
            actor([img, speed])
        for old_var, var in zip(actor.variables, self.actor_critic.variables):
            var.assign(old_var)
        var = self.actor_critic.actor.trainable_variables
        old_var = actor.trainable_variables

        self.target_init(self.target_actor_critic, self.actor_critic)

        self.savepath = os.path.join(self.logger.output_dir, "saver")
        checkpoint = tf.train.Checkpoint(model=self.actor_critic,
                                         target_model=self.target_actor_critic)
        self.manager = tf.train.CheckpointManager(checkpoint,
                                                  directory=self.savepath,
                                                  max_to_keep=20,
                                                  checkpoint_name="model.ckpt")

    def get_action(self, o, noise_scale, eval_mode=False):
        img = o["img"].astype(np.float32) / 255.0
        speed = np.array([o["speed"]])
        direction = o["direction"]

        a_list, z = self.actor_critic.select_action(
            [img[np.newaxis, :], speed[np.newaxis, :]])
        a = tf.squeeze(a_list[direction], axis=0)  # [act_dim]
        # print("----ori:" + str(a))
        if not eval_mode:
            a += noise_scale * np.random.randn(self.act_dim)
        return np.clip(a, -self.act_limit, self.act_limit)

    def target_init(self, target_net, net):
        for target_params, params in zip(target_net.trainable_variables,
                                         net.trainable_variables):
            target_params.assign(params)

    def target_update(self, target_net, net):
        for target_params, params in zip(target_net.trainable_variables,
                                         net.trainable_variables):
            target_params.assign(self.polyak * target_params +
                                 (1 - self.polyak) * params)

    def train_q(self, batch):
        with tf.GradientTape() as tape:
            img1 = batch['obs1']["img"]
            speed1 = batch['obs1']["speed"]
            direction1 = batch['obs1']["direction"]
            direction1 = tf.stack([tf.range(self.batch_size), direction1],
                                  axis=1)  # [None, 2]

            img2 = batch["obs2"]["img"]
            speed2 = batch["obs2"]["speed"]
            direction2 = batch["obs2"]["direction"]
            direction2 = tf.stack([tf.range(self.batch_size), direction2],
                                  axis=1)  # [None, 2]

            q1_list, q2_list = self.actor_critic([img1, speed1, batch["acts"]])
            q1_list = tf.stack(q1_list, axis=1)  # [None, 4, 1]
            q2_list = tf.stack(q2_list, axis=1)  # [None, 4, 1]
            q1 = tf.gather_nd(q1_list, direction1)  # [None, 1]
            q2 = tf.gather_nd(q2_list, direction1)

            pi_targ_list, z = self.target_actor_critic.select_action(
                [img2, speed2])
            pi_targ_list = tf.stack(pi_targ_list[0:4], axis=1)  # [None, 4, 3]
            pi_targ = tf.gather_nd(pi_targ_list, direction2)  # [None, 3]

            epsilon = tf.random.normal(tf.shape(pi_targ),
                                       stddev=self.target_noise)
            epsilon = tf.clip_by_value(epsilon, -self.noise_clip,
                                       self.noise_clip)
            a2 = pi_targ + epsilon
            a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit)

            q1_targ_list, q2_targ_list = self.target_actor_critic(
                [img2, speed2, a2])
            q1_targ_list = tf.stack(q1_targ_list, axis=1)  # [None, 4, 1]
            q2_targ_list = tf.stack(q2_targ_list, axis=1)  # [None, 4, 1]
            q1_targ = tf.gather_nd(q1_targ_list, direction2)  # [None, 1]
            q2_targ = tf.gather_nd(q2_targ_list, direction2)
            min_q_targ = tf.minimum(q1_targ, q2_targ)
            backup = batch['rews'] + self.gamma * (1 - batch['done']) * min_q_targ

            q1_loss = tf.reduce_mean((q1 - backup)**2)
            q2_loss = tf.reduce_mean((q2 - backup)**2)
            q_loss = q1_loss + q2_loss

            if debug_mode and self.cur_tensorboard_step % 1 == 0:
                tensorboard_step = int(self.cur_tensorboard_step / 1)
                with self.summary.as_default():
                    tf.summary.scalar("loss_q1", q1_loss, tensorboard_step)
                    tf.summary.scalar("loss_q2", q2_loss, tensorboard_step)
                    tf.summary.scalar("loss_q", q_loss, tensorboard_step)
                    tf.summary.histogram("q1", q1, tensorboard_step)
                    tf.summary.histogram("q2", q2, tensorboard_step)
                    tf.summary.histogram("pi_targ", pi_targ, tensorboard_step)
                    tf.summary.histogram("pi_a2", a2, tensorboard_step)

        train_vars = self.actor_critic.q1.trainable_variables + \
                     self.actor_critic.q2.trainable_variables + \
                     self.actor_critic.actor.emd.trainable_variables
        # train_vars = self.actor_critic.actor.emd.trainable_variables

        q_gradient = tape.gradient(q_loss, train_vars)
        self.opti_q.apply_gradients(zip(q_gradient, train_vars))

    def train_p(self, batch):

        with tf.GradientTape() as tape:
            img1 = batch['obs1']["img"]
            speed1 = batch['obs1']["speed"]
            direction1 = batch['obs1']["direction"]
            direction1 = tf.stack([tf.range(self.batch_size), direction1],
                                  axis=1)  # [None, 2]

            pi_list, z = self.actor_critic.select_action([img1, speed1])
            pi_list = tf.stack(pi_list[0:4], axis=1)  # [None, 4, 3]
            pi = tf.gather_nd(pi_list, direction1)  # [None, 3]

            q1_pi_list = self.actor_critic.work_q1(z, pi)
            # q1_pi_list, _ = self.actor_critic([img1, speed1, pi])
            q1_pi_list = tf.stack(q1_pi_list, axis=1)
            q1_pi = tf.gather_nd(q1_pi_list, direction1)

            pi_loss = -tf.reduce_mean(q1_pi)

        train_vars_pi = self.actor_critic.actor.trainable_variables
        pi_gradient = tape.gradient(pi_loss, train_vars_pi)
        train_pi = [
            var for (var, gra) in zip(train_vars_pi, pi_gradient)
            if gra is not None
        ]
        pi_gra = [
            gra for (var, gra) in zip(train_vars_pi, pi_gradient)
            if gra is not None
        ]
        self.opti_pi.apply_gradients(zip(pi_gra, train_pi))
        self.target_update(self.target_actor_critic, self.actor_critic)

        if debug_mode and self.cur_tensorboard_step % 1 == 0:
            tensorboard_step = int(self.cur_tensorboard_step / 1)
            with self.summary.as_default():
                tf.summary.histogram("pi", pi, tensorboard_step)
                tf.summary.histogram("q1_pi", q1_pi, tensorboard_step)
                tf.summary.scalar("loss_pi", pi_loss, tensorboard_step)

    def run_one_phrase(self, min_step, eval_mode=False):
        step = 0
        episode = 0
        reward = 0.

        while step < min_step:
            done = False
            obs = self.env.reset()

            step_episode = 0
            reward_episode = 0
            while not done:

                # NB: the trailing "or True" makes this branch unconditional, so the
                # (pretrained) policy is always used and random warm-up actions never fire.
                if self.cur_train_step > self.start_steps or eval_mode or True:
                    a = self.get_action(obs, self.act_noise, eval_mode)

                else:
                    a = self.env.action_space.sample()

                # print(a)

                obs_, r, done, _ = self.env.step(a)

                step += 1
                step_episode += 1
                reward += r
                reward_episode += r

                if not eval_mode:
                    self.cur_train_step += 1
                    self.replay_buffer.add(obs, a, obs_, [r], [done])

                if step_episode >= self.max_ep_len:
                    break
                obs = obs_

            episode += 1
            if self.cur_train_step > self.start_steps and not eval_mode:
                for j in range(step_episode):
                    batch = self.replay_buffer.sample(self.batch_size)
                    self.cur_tensorboard_step += 1

                    self.train_q(batch)

                    if j % self.policy_delay == 0:
                        self.train_p(batch)

            if episode % 20 == 0 and not eval_mode:
                self.manager.save()

            print("ep:", episode, "step:", step_episode, "r:", reward_episode)
            if not eval_mode:
                self.logger.store(step=step_episode, reward=reward_episode)
            else:
                self.logger.store(step_test=step_episode,
                                  reward_test=reward_episode)

        return reward, episode

    def train_test(self):
        for i in range(self.iteration):
            print("iter:", i + 1)
            self.logger.store(iter=i + 1)
            reward, episode = self.run_one_phrase(self.train_step)
            # print("reward:", reward/episode, "episode:", episode)
            # tf.logging.info("reward: %.2f, episode: %.2f", reward/episode, episode)

            reward, episode = self.run_one_phrase(self.evaluation_step, True)
            # print("reward:", reward / episode, "episode:", episode)
            # tf.logging.info("reward_test: %.2f, episode_test: %.2f", reward/episode, episode)

            self.logger.log_tabular("reward", with_min_and_max=True)
            self.logger.log_tabular("step", with_min_and_max=True)
            self.logger.log_tabular("reward_test", with_min_and_max=True)
            self.logger.log_tabular("step_test", with_min_and_max=True)
            # self.logger.log_tabular('Q1Vals', with_min_and_max=True)
            # self.logger.log_tabular('Q2Vals', with_min_and_max=True)
            # self.logger.log_tabular('LossPi', average_only=True)
            # self.logger.log_tabular('LossQ', average_only=True)
            self.logger.dump_tabular()
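The two TD3 ingredients in train_q above, clipped double-Q targets and target policy smoothing, can be sketched with plain NumPy (the batch size, action bounds, and noise scales below are illustrative assumptions):

import numpy as np

rng = np.random.default_rng(0)
q1_targ = rng.normal(size=(5, 1))
q2_targ = rng.normal(size=(5, 1))
rews, done, gamma = rng.normal(size=(5, 1)), np.zeros((5, 1)), 0.99

# Clipped double-Q: take the pessimistic minimum of the two target critics
min_q_targ = np.minimum(q1_targ, q2_targ)
backup = rews + gamma * (1 - done) * min_q_targ

# Target policy smoothing: clipped Gaussian noise on the target action
pi_targ = rng.uniform(-1, 1, size=(5, 3))
eps = np.clip(rng.normal(scale=0.2, size=pi_targ.shape), -0.5, 0.5)
a2 = np.clip(pi_targ + eps, -1.0, 1.0)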
Example #17
def policyg(env_fn,
            actor_critic=ActorCritic,
            ac_kwargs=dict(),
            seed=0,
            episodes_per_epoch=40,
            epochs=500,
            gamma=0.99,
            lam=0.97,
            pi_lr=3e-4,
            vf_lr=1e-3,
            train_v_iters=80,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=10):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    ac_kwargs['action_space'] = env.action_space

    # Models
    ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs)

    # Buffers
    local_episodes_per_epoch = int(episodes_per_epoch / num_procs())
    buff = BufferA(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                   max_ep_len)

    # Count variables
    var_counts = tuple(
        count_vars(module) for module in [ac.policy, ac.value_f])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr)

    # Parameters Sync
    sync_all_params(ac.parameters())

    def update(e):
        obs, act, adv, ret, lgp_old = [
            torch.Tensor(x) for x in buff.retrieve_all()
        ]

        # Policy
        _, lgp, _ = ac.policy(obs, act)
        entropy = (-lgp).mean()

        # Policy loss (plain policy gradient term; no entropy bonus here)
        pi_loss = -(lgp * adv).mean()

        # Train policy
        train_pi.zero_grad()
        pi_loss.backward()
        average_gradients(train_pi.param_groups)
        train_pi.step()

        # Value function
        v = ac.value_f(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = ac.value_f(obs)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Log the changes
        _, lgp, _, v = ac(obs, act)
        entropy_new = (-lgp).mean()
        pi_loss_new = -(lgp * adv).mean()
        v_loss_new = F.mse_loss(v, ret)
        kl = (lgp_old - lgp).mean()
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     DeltaLossPi=(pi_loss_new - pi_loss),
                     DeltaLossV=(v_loss_new - v_l_old),
                     Entropy=entropy,
                     KL=kl)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_t = 0

    for epoch in range(epochs):
        ac.eval()
        # Policy rollout
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, logp_t, v_t = ac(obs)

                buff.store(o,
                           a.detach().numpy(), r, v_t.item(),
                           logp_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff.end_episode()
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger._torch_save(ac, fname="expert_torch_save.pt")

        # Update
        ac.train()

        update(epoch)

        # Log
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
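The `-(lgp * adv).mean()` objective above is the standard score-function policy gradient. A minimal sketch with a toy Gaussian policy (the shapes and distribution family are assumptions):

import torch

mu = torch.zeros(5, 2, requires_grad=True)      # stand-in policy mean outputs
log_std = torch.zeros(2, requires_grad=True)
dist = torch.distributions.Normal(mu, log_std.exp())
act = dist.sample()                             # sampled actions carry no grad
adv = torch.randn(5)                            # toy advantage estimates

lgp = dist.log_prob(act).sum(dim=-1)            # log pi(a|s), one value per sample
pi_loss = -(lgp * adv).mean()                   # ascend E[lgp * adv]
pi_loss.backward()                              # gradients flow into mu, log_std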
Example #18
    parser.add_argument('--batch', default=50)
    parser.add_argument('--norm_state', default=True)
    parser.add_argument('--norm_rewards', default=True)
    parser.add_argument('--is_clip_v', default=True)
    parser.add_argument('--max_grad_norm', default=-1, type=float)
    parser.add_argument('--anneal_lr', default=False)
    parser.add_argument('--debug', default=False)
    parser.add_argument('--log_every', default=10)
    args = parser.parse_args()

    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

    from utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    writer = SummaryWriter(os.path.join(logger.output_dir, "logs"))

    env = gym.make(args.env)
    if args.env_num > 1:
        env = [
            Env(args.env,
                norm_state=args.norm_state,
                norm_rewards=args.norm_rewards) for _ in range(args.env_num)
        ]
        env = SubVectorEnv(env)
    # env = CarRacing()
    state_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape
    action_max = env.action_space.high[0]
    ppo = core.PPO(state_dim,
Example #19
    parser.add_argument('--is_gae', action="store_true")
    parser.add_argument('--last_v', action="store_true")
    parser.add_argument('--max_grad_norm', default=-1, type=float)
    parser.add_argument('--anneal_lr', action="store_true")
    parser.add_argument('--debug', action="store_false")
    parser.add_argument('--log_every', default=10, type=int)
    parser.add_argument('--target_kl', default=0.03, type=float)
    parser.add_argument('--test_epoch', default=10, type=int)
    args = parser.parse_args()

    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

    from utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    writer = SummaryWriter(os.path.join(logger.output_dir, "logs"))
    with open(os.path.join(logger.output_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    env = make_atari(args.env)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = wrap_deepmind(env, frame_stack=True)
    env = ImageToPyTorch(env)
    # test_env = make_atari(args.env)
    # test_env = gym.wrappers.RecordEpisodeStatistics(test_env)
    # test_env = wrap_deepmind(test_env, frame_stack=True)
    # test_env = ImageToPyTorch(test_env)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
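ImageToPyTorch is not defined in this snippet; it presumably transposes HWC frames to the CHW layout PyTorch expects. A minimal sketch of such a wrapper (the class body is an assumption; only the name comes from the code above):

import numpy as np
import gym

class ImageToPyTorchSketch(gym.ObservationWrapper):
    """Transpose HWC image observations to CHW for PyTorch conv nets."""

    def __init__(self, env):
        super().__init__(env)
        h, w, c = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(c, h, w), dtype=np.uint8)

    def observation(self, obs):
        return np.transpose(np.asarray(obs), (2, 0, 1))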
Example #20
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 batch_size=32,
                 train_step=25000,
                 evaluation_step=3000,
                 max_ep_len=6000,
                 epsilon_train=0.1,
                 epsilon_eval=0.01,
                 replay_size=100000,
                 epsilon_decay_period=25000,
                 warmup_steps=2000,
                 iteration=200,
                 gamma=0.99,
                 target_update_period=800,
                 update_period=4,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=True,
                            port=port,
                            gpu=gpu)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session("", config=config)
        set_session(self.sess)

        self.build_model()
        self.cur_train_step = 0

        self.observation_shape = (84, 84)
        self.state_shape = (1, ) + self.observation_shape + (4, )
        self.s = np.zeros(self.state_shape)
        self.last_s = np.zeros(self.state_shape)

        if debug_mode:
            self.summary = tf.summary.FileWriter(
                os.path.join(self.logger.output_dir, "logs"))

        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32,
                                shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32,
                                       shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32,
                                       shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        tf.summary.histogram("q", self.q)
        # tf.summary.histogram("q_target", self.q_target)
        # tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()
Example #21
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.
            
        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
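A typical invocation of this function, assuming a continuous-control Gym environment is available (the environment id, hidden sizes, and output directory below are assumptions, not taken from the code above):

import gym

td3(lambda: gym.make('Pendulum-v0'),
    ac_kwargs=dict(hidden_sizes=(256, 256)),
    epochs=10,
    logger_kwargs=dict(output_dir='/tmp/td3_pendulum'))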
Example #22
def gail(env_fn,
         actor_critic=ActorCritic,
         ac_kwargs=dict(),
         disc=Discriminator,
         dc_kwargs=dict(),
         seed=0,
         episodes_per_epoch=40,
         epochs=500,
         gamma=0.99,
         lam=0.97,
         pi_lr=3e-3,
         vf_lr=3e-3,
         dc_lr=5e-4,
         train_v_iters=80,
         train_dc_iters=80,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10):

    l_lam = 0  # balances the two loss terms

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    ac_kwargs['action_space'] = env.action_space

    # Models
    ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs)
    disc = disc(input_dim=obs_dim[0], **dc_kwargs)

    # TODO: Load expert policy here
    expert = actor_critic(input_dim=obs_dim[0], **ac_kwargs)
    expert_name = "expert_torch_save.pt"
    expert = torch.load(osp.join(logger_kwargs['output_dir'], expert_name))

    # Buffers
    local_episodes_per_epoch = int(episodes_per_epoch / num_procs())
    buff_s = BufferS(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                     max_ep_len)
    buff_t = BufferT(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                     max_ep_len)

    # Count variables
    var_counts = tuple(
        count_vars(module) for module in [ac.policy, ac.value_f, disc.policy])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n' %
               var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr)
    train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr)

    # Parameters Sync
    sync_all_params(ac.parameters())
    sync_all_params(disc.parameters())

    def update(e):
        obs_s, act, adv, ret, lgp_old = [
            torch.Tensor(x) for x in buff_s.retrieve_all()
        ]
        obs_t, _ = [torch.Tensor(x) for x in buff_t.retrieve_all()]

        # Policy
        _, lgp, _ = ac.policy(obs_s, act)
        entropy = (-lgp).mean()

        # Policy loss
        # policy gradient term + entropy term
        pi_loss = -(lgp * adv).mean() - l_lam * entropy

        # Train policy
        if e > 10:
            train_pi.zero_grad()
            pi_loss.backward()
            average_gradients(train_pi.param_groups)
            train_pi.step()

        # Value function
        v = ac.value_f(obs_s)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = ac.value_f(obs_s)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Discriminator
        gt1 = torch.ones(obs_s.size()[0], dtype=torch.int)
        gt2 = torch.zeros(obs_t.size()[0], dtype=torch.int)
        _, lgp_s, _ = disc(obs_s, gt=gt1)
        _, lgp_t, _ = disc(obs_t, gt=gt2)
        dc_loss_old = -lgp_s.mean() - lgp_t.mean()
        for _ in range(train_dc_iters):
            _, lgp_s, _ = disc(obs_s, gt=gt1)
            _, lgp_t, _ = disc(obs_t, gt=gt2)
            dc_loss = -lgp_s.mean() - lgp_t.mean()

            # Discriminator train
            train_dc.zero_grad()
            dc_loss.backward()
            average_gradients(train_dc.param_groups)
            train_dc.step()

        _, lgp_s, _ = disc(obs_s, gt=gt1)
        _, lgp_t, _ = disc(obs_t, gt=gt2)
        dc_loss_new = -lgp_s.mean() - lgp_t.mean()

        # Log the changes
        _, lgp, _, v = ac(obs_s, act)  # evaluate on the retrieved batch, not the last rollout obs
        entropy_new = (-lgp).mean()
        pi_loss_new = -(lgp * adv).mean() - l_lam * entropy_new
        v_loss_new = F.mse_loss(v, ret)
        kl = (lgp_old - lgp).mean()
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     LossDC=dc_loss_old,
                     DeltaLossPi=(pi_loss_new - pi_loss),
                     DeltaLossV=(v_loss_new - v_l_old),
                     DeltaLossDC=(dc_loss_new - dc_loss_old),
                     DeltaEnt=(entropy_new - entropy),
                     Entropy=entropy,
                     KL=kl)

    start_time = time.time()
    o, r, sdr, d, ep_ret, ep_sdr, ep_len = env.reset(), 0, 0, False, 0, 0, 0
    total_t = 0

    ep_len_t = 0
    for epoch in range(epochs):
        ac.eval()
        disc.eval()
        # The probability term at index [0] corresponds to the teacher's policy
        # Student's policy rollout
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, logp_t, v_t = ac(obs)

                buff_s.store(o,
                             a.detach().numpy(), r, sdr, v_t.item(),
                             logp_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                _, sdr, _ = disc(torch.Tensor(o.reshape(1, -1)),
                                 gt=torch.Tensor([0]))
                if sdr < -4:  # Truncate rewards
                    sdr = -4
                ep_ret += r
                ep_sdr += sdr
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff_s.end_episode()
                    logger.store(EpRetS=ep_ret, EpLenS=ep_len, EpSdrS=ep_sdr)
                    o, r, sdr, d, ep_ret, ep_sdr, ep_len = \
                        env.reset(), 0, 0, False, 0, 0, 0

        # Teacher's policy rollout
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, _, _ = expert(obs)

                buff_t.store(o, a.detach().numpy(), r)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff_t.end_episode()
                    logger.store(EpRetT=ep_ret, EpLenT=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, [ac, disc], None)

        # Update
        ac.train()
        disc.train()

        update(epoch)

        # Log
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRetS', with_min_and_max=True)
        logger.log_tabular('EpSdrS', with_min_and_max=True)
        logger.log_tabular('EpLenS', average_only=True)
        logger.log_tabular('EpRetT', with_min_and_max=True)
        logger.log_tabular('EpLenT', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('LossDC', average_only=True)
        logger.log_tabular('DeltaLossDC', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('DeltaEnt', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
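The discriminator objective above, `-lgp_s.mean() - lgp_t.mean()`, maximizes the log-likelihood of labeling student and teacher states correctly. An analogous binary cross-entropy sketch with a toy logit network (the architecture and batch sizes are assumptions):

import torch
import torch.nn as nn

disc_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))
bce = nn.BCEWithLogitsLoss()

obs_s = torch.randn(8, 4)   # toy student rollout observations
obs_t = torch.randn(8, 4)   # toy teacher (expert) observations

# Minimizing BCE on opposite labels is the same as maximizing
# log D(student) + log(1 - D(teacher)).
loss = bce(disc_net(obs_s), torch.ones(8, 1)) + \
       bce(disc_net(obs_t), torch.zeros(8, 1))
loss.backward()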
Example #23
class Dqn:
    def __init__(self,
                 env_name,
                 port=2000,
                 gpu=0,
                 batch_size=32,
                 train_step=25000,
                 evaluation_step=3000,
                 max_ep_len=6000,
                 epsilon_train=0.1,
                 epsilon_eval=0.01,
                 replay_size=100000,
                 epsilon_decay_period=25000,
                 warmup_steps=2000,
                 iteration=200,
                 gamma=0.99,
                 target_update_period=800,
                 update_period=4,
                 logger_kwargs=dict()):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = CarlaEnv(early_termination_enabled=True,
                            run_offscreen=True,
                            port=port,
                            gpu=gpu)

        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session("", config=config)
        set_session(self.sess)

        self.build_model()
        self.cur_train_step = 0

        self.observation_shape = (84, 84)
        self.state_shape = (1, ) + self.observation_shape + (4, )
        self.s = np.zeros(self.state_shape)
        self.last_s = np.zeros(self.state_shape)

        if debug_mode:
            self.summary = tf.summary.FileWriter(
                os.path.join(self.logger.output_dir, "logs"))

        self.loss = tf.placeholder(tf.float32, shape=[])
        self.q = tf.placeholder(tf.float32,
                                shape=[None, self.env.action_space.n])
        self.q_target = tf.placeholder(tf.float32,
                                       shape=[None, self.env.action_space.n])
        self.target_q = tf.placeholder(tf.float32,
                                       shape=[None, self.env.action_space.n])
        tf.summary.scalar("loss", self.loss)
        tf.summary.histogram("q", self.q)
        # tf.summary.histogram("q_target", self.q_target)
        # tf.summary.histogram("target_q", self.target_q)
        self.merge = tf.summary.merge_all()

    def build_model(self):
        self.model, self.model_target = nature_dqn(self.env.action_space.n,
                                                   (80, 80, 6))
        self.model_target.set_weights(self.model.get_weights())

    def choose_action(self, s, eval_mode=False):
        epsilon = self.epsilon_eval if eval_mode \
            else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step, self.warmup_steps, self.epsilon_train)

        if random.random() <= 1 - epsilon:
            q = self.model.predict(s[np.newaxis, :])
            a = np.argmax(q, axis=1)[0]
            # print()
        else:
            a = self.env.action_space.sample()

        return a

    def record_obs(self, observation):
        self.last_s = copy.copy(self.s)
        self.s = np.roll(self.s, -1, axis=-1)
        self.s[0, ..., -1] = np.squeeze(observation)

    def store(self, s, a, s_, r, done):
        pass

    def run_one_phrase(self, min_step, eval_mode=False):
        step = 0
        episode = 0
        reward = 0.

        while step < min_step:
            done = False
            obs = self.env.reset()

            step_episode = 0
            reward_episode = 0
            while not done:

                s = np.array(obs)
                a = self.choose_action(s, eval_mode)

                obs_, r, done, _ = self.env.step(a)

                step += 1
                step_episode += 1
                reward += r
                reward_episode += r

                if not eval_mode:
                    self.cur_train_step += 1
                    self.replay_buffer.add(obs, a, obs_, r, done)

                    if self.cur_train_step > 2000:  # NB: hard-coded warm-up; ignores self.warmup_steps
                        if self.cur_train_step % self.update_period == 0:
                            # data = self.replay_buffer.sample()
                            (s, a, s_, r,
                             d) = self.replay_buffer.sample(self.batch_size)
                            q_ = np.max(self.model_target.predict(s_), axis=1)
                            q_target = r + (1 - d) * self.gamma * q_
                            q = self.model.predict(s)
                            q_recoder = np.copy(q)

                            batch_index = np.arange(self.batch_size)
                            q[batch_index, a] = q_target
                            result = self.model.train_on_batch(np.array(s), q)
                            # print("result:", result)

                            # if self.cur_train_step%1== 0:
                            #     merge = self.sess.run(self.merge, feed_dict={self.loss: result[0], self.q: q_recoder})
                            #     self.summary.add_summary(merge, (self.cur_train_step-20000)/self.update_period/1)

                        if self.cur_train_step % self.target_update_period == 0:
                            self.model_target.set_weights(
                                self.model.get_weights())

                if step_episode >= self.max_ep_len:
                    break
                obs = obs_

            episode += 1

            # Keep a rolling window of the five most recent checkpoints.
            savepath = os.path.join(self.logger.output_dir, "saver")
            if not os.path.exists(savepath):
                os.makedirs(savepath)
            self.model.save(
                os.path.join(savepath, "model" + str(episode % 5) + ".h5"))

            print("ep:", episode, "step:", step, "r:", reward)
            if not eval_mode:
                self.logger.store(step=step_episode, reward=reward_episode)
            else:
                self.logger.store(step_test=step_episode,
                                  reward_test=reward_episode)

        return reward, episode

    def train_test(self):
        for i in range(self.iteration):
            print("iter:", i + 1)
            self.logger.store(iter=i + 1)
            reward, episode = self.run_one_phase(self.train_step)
            # `tf.logging` was removed in TF2; the compat alias works in
            # both TF1 and TF2.
            tf.compat.v1.logging.info("reward: %.2f, episode: %.2f",
                                      reward / episode, episode)

            reward, episode = self.run_one_phase(self.evaluation_step, True)
            tf.compat.v1.logging.info("reward_test: %.2f, episode_test: %.2f",
                                      reward / episode, episode)

            self.logger.log_tabular("reward", with_min_and_max=True)
            self.logger.log_tabular("step", with_min_and_max=True)
            self.logger.log_tabular("reward_test", with_min_and_max=True)
            self.logger.log_tabular("step_test", with_min_and_max=True)
            self.logger.dump_tabular()
Example #24
0
            # (tail of the training loop: refit the learned dynamics and
            # cost models on the freshly collected data)
            dynamic_model.fit(use_data_buf=True, normalize=True)
            cost_model.fit()
    env.close()

if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--robot', type=str, default='point', help="robot model, selected from `point` or `car`")
    parser.add_argument('--level', type=int, default=1, help="environment difficulty, selected from `1` or `2`, where `2` is more difficult than `1`")
    parser.add_argument('--epoch', type=int, default=60, help="maximum number of training epochs")
    parser.add_argument('--episode', type=int, default=10, help="number of episodes of data to collect per epoch")
    parser.add_argument('--render', '-r', action='store_true', help="render the environment")
    parser.add_argument('--test', '-t', action='store_true', help="test the performance of pretrained models without training")

    parser.add_argument('--seed', '-s', type=int, default=1, help="seed for Gym, PyTorch and NumPy")
    parser.add_argument('--dir', '-d', type=str, default='./data/', help="directory to save the logging information")
    parser.add_argument('--name', '-n', type=str, default='test', help="name of the experiment; data is saved in a folder with this name")
    parser.add_argument('--save', action='store_true', help="save the trained dynamic model, data buffer, and cost model")
    parser.add_argument('--load', type=str, default=None, help="load the trained dynamic model, data buffer, and cost model from the specified directory")
    parser.add_argument('--ensemble', type=int, default=0, help="number of model ensembles; if greater than 0, overrides the default ensemble number in config.yml")
    parser.add_argument('--optimizer', '-o', type=str, default="rce", help="optimizer, selected from `rce`, `cem`, or `random`")
    parser.add_argument('--config', '-c', type=str, default='./config.yml', help="path to the configuration file of the models")

    args = parser.parse_args()
    logger_kwargs = setup_logger_kwargs(args.name, args.seed, args.dir)
    logger = EpochLogger(**logger_kwargs)
    config = load_config(args.config)

    run(logger, config, args)
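    # A hypothetical invocation using the flags defined above (the script
    # name is an assumption, not part of this snippet):
    #   python train.py --robot car --level 2 --optimizer cem --save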
    
Example #25
0
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    if inspect.isclass(actor_critic):
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    else:
        ac = actor_critic
    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
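    # PPOBuffer (defined elsewhere) is assumed to compute GAE-Lambda
    # advantages when each path is finished:
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = sum_{l >= 0} (gamma * lam)^l * delta_{t+l}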

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
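        # This is the PPO-Clip objective,
        #   L = E[ min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) ],
        # negated because the optimizer minimizes.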

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)  # current state
            logger.save_state({'env': env}, epoch)  # for rendering

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

    logger.output_file.close()
Example #26
0
    parser.add_argument('--log', type=str, default="logs")
    parser.add_argument('--steps', type=int, default=300)
    parser.add_argument('--port', type=int, default=2000)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default="ppo_carla")
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--batch', type=int, default=50)
    args = parser.parse_args()

    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    from utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    # logger.save_config(locals())

    env = CarlaEnv(early_termination_enabled=True,
                   run_offscreen=False,
                   port=args.port,
                   gpu=args.gpu,
                   discrete_control=False)
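    # Note: `--lr_a` and `--lr_c` are added by parser lines omitted from
    # this snippet.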
    ppo = core.PPO(3, 0.2, lr_a=args.lr_a, lr_c=args.lr_c)

    if debug_mode:
        summary = tf.summary.create_file_writer(
            os.path.join(logger.output_dir, "logs"))

    savepath = osp.join(logger.output_dir, "saver")
    checkpoint = tf.train.Checkpoint(model=ppo)
Example #27
0
    def logger_setup(self, logger_kwargs, **kwargs):
        self.logger = EpochLogger(**logger_kwargs)
        for key, value in kwargs.items():
            if key != 'env' and key != 'output_dir':
                self.logger.log_tabular(key, value)
Example #28
0
def run_policy(env,
               get_action,
               ckpt_num,
               max_con,
               con,
               max_ep_len=100,
               num_episodes=100,
               fpath=None,
               render=False,
               record=True,
               video_caption_off=False):

    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    output_dir = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),
                          "log/tmp/experiments/%i" % int(time.time()))
    logger = EpochLogger(output_dir=output_dir)
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    visual_obs = []
    c_onehot = F.one_hot(torch.tensor(con), max_con).squeeze().float()
    while n < num_episodes:

        vob = render_frame(env,
                           ep_len,
                           ep_ret,
                           'AC',
                           render,
                           record,
                           caption_off=video_caption_off)
        visual_obs.append(vob)
        concat_obs = torch.cat(
            [torch.Tensor(o.reshape(1, -1)),
             c_onehot.reshape(1, -1)], 1)
        a = get_action(concat_obs)
        o, r, d, _ = env.step(a[0].detach().numpy()[0])
        ep_ret += r
        ep_len += 1
        d = False  # ignore env termination; episodes end only at max_ep_len

        if d or (ep_len == max_ep_len):
            vob = render_frame(env,
                               ep_len,
                               ep_ret,
                               'AC',
                               render,
                               record,
                               caption_off=video_caption_off)
            visual_obs.append(vob)  # add last frame
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()
    if record:
        # temp_info: [video_prefix, ckpt_num, ep_ret, ep_len, con]
        temp_info = ['', ckpt_num, ep_ret, ep_len, con]
        logger.save_video(visual_obs, temp_info, fpath)
Example #29
0
    def __init__(self, env_fn, actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=100,
                 epochs=10000,
                 replay_size=int(2000000),
                 gamma=0.99,
                 polyak=0.995,
                 lr=3e-4,
                 p_lr=3e-4,
                 alpha=0.0,
                 batch_size=1024,
                 start_steps=10000,
                 update_after=0,
                 update_every=50,
                 num_test_episodes=10,
                 max_ep_len=1000,
                 logger_kwargs=dict(),
                 save_freq=1,
                 algo='SAC'):
        """
        Soft Actor-Critic (SAC)


        Args:
            env_fn : A function which creates a copy of the environment.
                The environment must satisfy the OpenAI Gym API.

            actor_critic: The constructor method for a PyTorch Module with an ``act`` 
                method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
                The ``act`` method and ``pi`` module should accept batches of 
                observations as inputs, and ``q1`` and ``q2`` should accept a batch 
                of observations and a batch of actions as inputs. When called, 
                ``act``, ``q1``, and ``q2`` should return:

                ===========  ================  ======================================
                Call         Output Shape      Description
                ===========  ================  ======================================
                ``act``      (batch, act_dim)  | Numpy array of actions for each
                                               | observation.
                ``q1``       (batch,)          | Tensor containing one current estimate
                                               | of Q* for the provided observations
                                               | and actions. (Critical: make sure to
                                               | flatten this!)
                ``q2``       (batch,)          | Tensor containing the other current
                                               | estimate of Q* for the provided
                                               | observations and actions. (Critical:
                                               | make sure to flatten this!)
                ===========  ================  ======================================

                Calling ``pi`` should return:

                ===========  ================  ======================================
                Symbol       Shape             Description
                ===========  ================  ======================================
                ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                               | given observations.
                ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                               | actions in ``a``. Importantly: gradients
                                               | should be able to flow back into ``a``.
                ===========  ================  ======================================

            ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
                you provided to SAC.

            seed (int): Seed for random number generators.

            steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
                for the agent and the environment in each epoch.

            epochs (int): Number of epochs to run and train agent.

            replay_size (int): Maximum length of replay buffer.

            gamma (float): Discount factor. (Always between 0 and 1.)

            polyak (float): Interpolation factor in polyak averaging for target 
                networks. Target networks are updated towards main networks 
                according to:

                .. math:: \\theta_{\\text{targ}} \\leftarrow 
                    \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

                where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
                close to 1.)

            lr (float): Learning rate (used for both policy and value learning).

            alpha (float): Entropy regularization coefficient. (Equivalent to 
                inverse of reward scale in the original SAC paper.)

            batch_size (int): Minibatch size for SGD.

            start_steps (int): Number of steps for uniform-random action selection,
                before running real policy. Helps exploration.

            update_after (int): Number of env interactions to collect before
                starting to do gradient descent updates. Ensures replay buffer
                is full enough for useful updates.

            update_every (int): Number of env interactions that should elapse
                between gradient descent updates. Note: Regardless of how long 
                you wait between updates, the ratio of env steps to gradient steps 
                is locked to 1.

            num_test_episodes (int): Number of episodes to test the deterministic
                policy at the end of each epoch.

            max_ep_len (int): Maximum length of trajectory / episode / rollout.

            logger_kwargs (dict): Keyword args for EpochLogger.

            save_freq (int): How often (in terms of gap between epochs) to save
                the current policy and value function.

            """

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space, self.env.action_space,
                               special_policy='awac', **ac_kwargs)
        self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space,
                                    special_policy='awac', **ac_kwargs)
        self.ac_targ.load_state_dict(self.ac.state_dict())
        self.gamma = gamma

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # List of parameters for both Q-networks (save this for convenience)
        self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters())

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim,
                                          size=replay_size)

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)
        self.algo = algo

        self.p_lr = p_lr
        self.lr = lr
        # Use the constructor's `alpha` (it was previously hard-coded to 0,
        # silently ignoring the argument; the default is still 0.0).
        self.alpha = alpha

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.num_test_episodes = num_test_episodes
        self.max_ep_len = max_ep_len
        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.update_after = update_after
        self.update_every = update_every
        self.batch_size = batch_size
        self.save_freq = save_freq
        self.polyak = polyak
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)
        print("Running Offline RL algorithm: {}".format(self.algo))
Example #30
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
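    # Equivalent to -E[min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)]:
    # for positive advantages only the (1 + clip_ratio) bound can bind, and
    # for negative advantages only the (1 - clip_ratio) bound can.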
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()