Example #1
File: sql.py Project: jinparksj/RL_NABI
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        # evaluation_env = deep_clone(env) if self._eval_n_episodes else None
        # if self.high_lv_control:
        #     evaluation_env = env
        # else:
        evaluation_env = deep_clone(env) if self._eval_n_episodes else None
        # TODO: use Ezpickle to deep_clone???

        with tf.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                # Added to render
                # if self._eval_render:
                #     from schema.utils.sampler_utils import rollout
                #     rollout(self.env, self.policy, max_path_length=1000, animated=True)

            self.sampler.terminate()
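The timing calls above come from the gtimer package (imported as gt). A minimal, self-contained sketch of the same stamp/read-back pattern, with illustrative loop bounds and placeholder work:

import gtimer as gt

gt.reset()
gt.set_def_unique(False)
for epoch in gt.timed_for(range(3), save_itrs=True):
    # ... collect samples ...
    gt.stamp('sample')
    # ... run training updates ...
    gt.stamp('train')

itrs = gt.get_times().stamps.itrs
print(itrs['sample'][-1], itrs['train'][-1])  # per-iteration durations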
Example #2
File: sql.py Project: jinparksj/RL_NABI
    def _evaluate(self, policy, evaluation_env):
        """Perform evaluation for the current policy."""

        if self._eval_n_episodes < 1:
            return

        # TODO: max_path_length should be a property of environment.
        # Without this assignment, the Python builtin `input` would be passed
        # to rollouts below.
        input = None if self.high_lv_control else self._action_dim
        paths = rollouts(evaluation_env, policy, self.sampler._max_path_length,
                         self._eval_n_episodes, input)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        # TODO: figure out how to pass log_diagnostics through
        evaluation_env.log_diagnostics(paths)
        if self._eval_render:
            evaluation_env.render(paths)

        if self.sampler.batch_ready():
            batch = self.sampler.random_batch()
            self.log_diagnostics(batch)
    def train(self, envs):
        self.training_step = 0
        best_reward = 0
        visited_rooms = set()
        eplen = 0

        rollout_idx = 0
        state = np.transpose(envs.reset(), (0, 3, 1, 2))

        # rollout
        while rollout_idx < self.num_rollouts:
            states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84),
                              np.float32)
            actions = np.zeros((self.num_steps, self.num_envs), np.int32)
            action_log_probs = np.zeros((self.num_steps, self.num_envs),
                                        np.float32)
            rewards = np.zeros((self.num_steps, self.num_envs), np.float32)
            next_states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84),
                                   np.float32)
            dones = np.zeros((self.num_steps, self.num_envs), np.int32)

            current_best_reward = 0
            hidden = None

            for t in range(self.num_steps):
                action, action_log_prob, hidden = self.select_action(
                    state, hidden)
                next_state, reward, done, info = envs.step(action)
                # TensorFlow format to PyTorch
                next_state = np.transpose(next_state, (0, 3, 1, 2))

                # transitions
                states[t, ...] = state
                actions[t, ...] = action
                action_log_probs[t, ...] = action_log_prob
                rewards[t, ...] = reward
                next_states[t, ...] = next_state
                dones[t, ...] = done

                if self.render:
                    envs.render(0)
                state = next_state

                # done
                for i, dne in enumerate(done):
                    if dne:
                        epinfo = info[i]['episode']
                        if 'visited_rooms' in epinfo:
                            visited_rooms |= epinfo['visited_rooms']

                        best_reward = max(epinfo['r'], best_reward)
                        current_best_reward = max(epinfo['r'],
                                                  current_best_reward)
                        eplen += epinfo['l']

            # logger
            logger.info('GAME STATUS')
            logger.record_tabular('rollout_idx', rollout_idx)
            logger.record_tabular(
                'visited_rooms',
                str(len(visited_rooms)) + ', ' + str(visited_rooms))
            logger.record_tabular('best_reward', best_reward)
            logger.record_tabular('current_best_reward', current_best_reward)
            logger.record_tabular('eplen', eplen)
            logger.dump_tabular()

            # train neural networks
            self.update_parameters(states, actions, action_log_probs, rewards,
                                   next_states, dones)
            rollout_idx += 1
    def update_parameters(self, states, actions, action_log_probs, rewards,
                          next_states, dones):
        # T * B * features
        states = torch.from_numpy(states).to(dtype=torch.float32,
                                             device=self.device)
        actions = torch.from_numpy(actions).to(dtype=torch.int32,
                                               device=self.device)
        old_action_log_probs = torch.from_numpy(action_log_probs).to(
            dtype=torch.float32, device=self.device)
        rewards = torch.from_numpy(rewards).to(dtype=torch.float32,
                                               device=self.device)
        next_states = torch.from_numpy(next_states).to(dtype=torch.float32,
                                                       device=self.device)
        masks = 1 - torch.from_numpy(dones).to(dtype=torch.float32,
                                               device=self.device)

        # GENERALIZED ADVANTAGE ESTIMATION
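        # The recursion implemented below (mask_t = 1 - done_t):
        #   delta_t = r_t + gamma * mask_t * V(s_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * mask_t * A_{t+1}
        # and the value targets are returns_t = A_t + V(s_t).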
        with torch.no_grad():
            advantages = torch.zeros_like(rewards)
            _, values, _ = self.actor_critic(
                torch.cat([states, next_states[-1].unsqueeze(0)], dim=0))
            values = values.squeeze(2)  # remove last dimension

            last_gae_lam = 0
            for t in range(self.num_steps - 1, -1, -1):
                delta = rewards[t] + masks[t] * \
                    self.gamma * values[t + 1] - values[t]
                advantages[t, :] = delta + masks[t] * \
                    self.lamda * self.gamma * last_gae_lam
                last_gae_lam = advantages[t]

            returns = advantages + values[:-1]

        logger.info('GENERALIZED ADVANTAGE ESTIMATION')
        logger.record_tabular('advantages mean', advantages.mean().item())
        logger.record_tabular('advantages std', advantages.std().item())
        logger.record_tabular('returns mean', returns.mean().item())
        logger.record_tabular('returns std', returns.std().item())
        logger.dump_tabular()

        # train epochs
        for epoch_idx in range(self.update_epochs):
            self.training_step += 1
            # sample (T * B * features)
            slic = random.sample(list(range(self.num_envs)), self.sample_envs)

            state = states[:, slic, ...].contiguous()
            action = actions[:, slic, ...]
            old_action_log_prob = old_action_log_probs[:, slic, ...]
            retur = returns[:, slic, ...]
            advantage = advantages[:, slic, ...]

            # policy loss
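            # Clipped surrogate objective: with
            #   ratio = exp(log_prob_new - log_prob_old),
            # the loss is -E[min(ratio * A, clip(ratio, 1 - clip_range, 1 + clip_range) * A)].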
            dist, value, _ = self.actor_critic(state)
            action_log_prob = dist.log_prob(action)

            ratio = torch.exp(action_log_prob - old_action_log_prob)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - self.clip_range,
                                1.0 + self.clip_range) * advantage
            action_loss = -torch.mean(torch.min(surr1, surr2), dim=(0, 1))

            # value loss
            smooth_l1_loss = nn.SmoothL1Loss(reduction='mean')
            value_loss = smooth_l1_loss(retur.flatten(), value.flatten())

            # entropy loss
            entropy_loss = -torch.mean(dist.entropy(), dim=(0, 1))

            # backprop
            loss = action_loss + value_loss + self.coeff_ent * entropy_loss
            self.optimizer.zero_grad()
            loss.backward()

            if self.max_grad_norm > 1e-8:
                nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                         self.max_grad_norm)

            self.optimizer.step()

            if self.training_step % 10000 == 0:
                self.save_param(self.saved_path)

        logger.info('UPDATE')
        logger.record_tabular('training_step', self.training_step)
        logger.record_tabular('value_loss', value_loss.item())
        logger.record_tabular('policy_loss', action_loss.item())
        logger.record_tabular('entropy_loss', entropy_loss.item())
        logger.dump_tabular()
Example #5
File: acktr.py Project: qqadssp/ACKTR
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')

    X_v, vtarg_n_v, loss2, loss_sampled2 = vf.update_info
    # NOTE: `async` is a reserved keyword from Python 3.7 on; newer kfac
    # implementations rename this argument.
    optim2 = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9),
                                momentum=0.9, clip_kl=0.3, epsilon=0.1,
                                stats_decay=0.95, async=0, kfac_update=2,
                                cold_iter=50, weight_decay_dict=vf.wd_dict,
                                max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op2 = optim2.minimize(loss2, loss_sampled2, var_list=vf_var_list)

    ob_p, oldac_p, adv_p, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9, kfac_update=2, epsilon=1e-2,
                               stats_decay=0.99, async=0, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)
    update_op = optim.minimize(loss, loss_sampled, var_list=pi_var_list)

    sess = tf.get_default_session()
    sess.run(tf.variables_initializer(set(tf.global_variables())))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
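        # Per path: bootstrap the final value with the last prediction when the
        # episode was cut off (0.0 if it terminated), form the TD residuals
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
        # and discount them with gamma * lam to obtain GAE advantages.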
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        paths_ = []
        for p in paths:
            l = pathlength(p)
            act = p["action_dist"].astype('float32')
            paths_.append(
                np.concatenate([p['observation'], act,
                                np.ones((l, 1))],
                               axis=1))
        X1 = np.concatenate(paths_)
        y = np.concatenate(vtargs)
        logger.record_tabular("EVBefore",
                              explained_variance(vf._predict(X1), y))
        # NOTE: the value-function update below is commented out, so EVAfter
        # will match EVBefore.
        # for _ in range(20):
        #     sess.run(update_op2, {X_v: X1, vtarg_n_v: y})  # do_update2(X, y)
        logger.record_tabular("EVAfter",
                              explained_variance(vf._predict(X1), y))

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        sess.run(update_op, {
            ob_p: ob_no,
            oldac_p: action_na,
            adv_p: standardized_adv_n
        })

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
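        # Heuristic trust region on the stepsize: shrink it by 1.5x when the
        # policy KL exceeds twice the target, grow it by 1.5x when the KL
        # falls below half the target.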
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1
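explained_variance here is assumed to follow the usual baselines-style definition, 1 - Var[y - ypred] / Var[y] (1.0 is a perfect value fit, 0.0 means no better than a constant predictor); a minimal sketch under that assumption, since the project's own helper may differ:

import numpy as np

def explained_variance(ypred, y):
    # Returns NaN when the targets have zero variance.
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary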
Example #6
    def train(self, env):
        # Memory
        memory = ReplayBuffer(capacity=self.replay_size)

        # Training Loop
        total_numsteps = 0
        updates = 0

        for i_episode in itertools.count(1):
            episode_reward = 0
            episode_steps = 0
            done = False
            state = env.reset()

            while not done:
                if total_numsteps < self.start_steps:
                    action = env.action_space.sample()  # Sample random action
                else:
                    # Sample action from policy
                    action = self.select_action(state)

                if len(memory) > self.batch_size:
                    # Number of updates per step in environment
                    for i in range(self.updates_per_step):
                        # Update parameters of all the networks
                        q1_loss, q2_loss, policy_loss, alpha_loss = self.update_parameters(
                            memory, self.batch_size, updates)
                        updates += 1

                next_state, reward, done, _ = env.step(action)  # Step
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward

                if self.render:
                    env.render()

                # Ignore the "done" signal if it comes from hitting the time horizon.
                # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
                done = 0 if episode_steps == env._max_episode_steps else done

                memory.push(state, action, reward, next_state,
                            done)  # Append transition to memory

                state = next_state

            logger.info('UPDATE')
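            # NOTE: the losses below are only defined once len(memory) has
            # exceeded batch_size at least once during the episode.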
            logger.record_tabular('q1_loss', q1_loss)
            logger.record_tabular('q2_loss', q2_loss)
            logger.record_tabular('policy_loss', policy_loss)
            logger.record_tabular('alpha_loss', alpha_loss)
            logger.dump_tabular()

            logger.info('STATUS')
            logger.record_tabular('i_episode', i_episode)
            logger.record_tabular('episode_steps', episode_steps)
            logger.record_tabular('total_numsteps', total_numsteps)
            logger.record_tabular('episode_reward', episode_reward)
            logger.dump_tabular()

            if i_episode % 100 == 0:
                logger.info('SAVE')
                self.save_model('../saved/sac')

            if total_numsteps > self.num_steps:
                return
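The ReplayBuffer above is project-specific; below is a minimal sketch with the same interface used here (a capacity= constructor, push(state, action, reward, next_state, done), len()), plus an assumed sample helper for the batched updates inside update_parameters:

import random
import numpy as np

class SimpleReplayBuffer:
    """Ring buffer of transitions; sample() draws a uniform random batch."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.stack, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)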
Example #7
    def train(self, envs):

        self.training_step = 0
        best_reward = torch.zeros((1,), device=self.device)
        eplen = torch.zeros((1,), device=self.device, dtype=torch.int32)
        visited_rooms = set()

        rollout_idx = 0
        state = np.transpose(envs.reset(), (0, 3, 1, 2))

        # rollout
        while rollout_idx < self.num_rollouts:
            # sync model
            distributed_util.sync_model(self.actor_critic)

            states = np.zeros(
                (self.num_steps, self.num_envs, 1, 84, 84), np.float32)
            actions = np.zeros((self.num_steps, self.num_envs), np.int32)
            action_log_probs = np.zeros(
                (self.num_steps, self.num_envs), np.float32)
            rewards = np.zeros((self.num_steps, self.num_envs), np.float32)
            next_states = np.zeros(
                (self.num_steps, self.num_envs, 1, 84, 84), np.float32)
            dones = np.zeros((self.num_steps, self.num_envs), np.int32)

            current_best_reward = torch.zeros((1,), device=self.device)
            hidden = None

            for t in range(self.num_steps):
                action, action_log_prob, hidden = self.select_action(
                    state, hidden)
                next_state, reward, done, info = envs.step(action)
                # TensorFlow format to PyTorch
                next_state = np.transpose(next_state, (0, 3, 1, 2))

                # transitions
                states[t, ...] = state
                actions[t, ...] = action
                action_log_probs[t, ...] = action_log_prob
                rewards[t, ...] = reward
                next_states[t, ...] = next_state
                dones[t, ...] = done

                if self.render:
                    envs.render(0)
                state = next_state

                # done
                for i, dne in enumerate(done):
                    if dne:
                        epinfo = info[i]['episode']
                        if 'visited_rooms' in epinfo:
                            visited_rooms |= epinfo['visited_rooms']

                        best_reward[0] = max(epinfo['r'], best_reward[0])
                        current_best_reward[0] = max(
                            epinfo['r'], current_best_reward[0])
                        eplen[0] += epinfo['l']

            # logger
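            # All-reduce with MAX keeps the best episode reward seen by any
            # worker; only rank 0 writes the aggregated stats to the logger.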
            dist.all_reduce(best_reward, op=dist.ReduceOp.MAX)
            dist.all_reduce(current_best_reward, op=dist.ReduceOp.MAX)
            # TODO: sync visited_rooms

            if self.rank == 0:
                logger.info('GAME STATUS')
                logger.record_tabular('rollout_idx', rollout_idx)
                logger.record_tabular('visited_rooms',
                                      str(len(visited_rooms)) + ', ' + str(visited_rooms))
                logger.record_tabular('best_reward', best_reward.item())
                logger.record_tabular(
                    'current_best_reward', current_best_reward.item())
                logger.record_tabular(
                    'eplen', eplen.item() * dist.get_world_size())
                logger.dump_tabular()

            # train neural networks
            self.update_parameters(states, actions, action_log_probs,
                                   rewards, next_states, dones)
            rollout_idx += 1