Example #1
def setup_logger(algo=None, dirname=None, exp_prefix='exp', log_base_dir=LOG_BASE_DIR):
    reset_logger()
    exp_uuid = str(uuid.uuid4())
    # Generate a unique experiment directory if the caller did not supply one.
    if dirname is None:
        dirname = generate_exp_name(exp_prefix=exp_prefix, exp_id=exp_uuid, log_base_dir=log_base_dir)
    rllablogger.set_snapshot_dir(dirname)
    dirname = rllablogger.get_snapshot_dir()
    # Stream tabular logging output to progress.csv inside the snapshot directory.
    rllablogger.add_tabular_output(os.path.join(dirname, 'progress.csv'))
    # If an algorithm object is given, record its hyperparameters plus the run uuid.
    if algo:
        with open(os.path.join(dirname, 'params.json'), 'w') as f:
            params = extract_hyperparams(algo)
            params['uuid'] = exp_uuid
            json.dump(params, f)
    # Generator-style setup: yield the log directory to the caller, then detach
    # the CSV output when the generator is resumed (e.g. via contextlib.contextmanager).
    yield dirname
    rllablogger.remove_tabular_output(os.path.join(dirname, 'progress.csv'))
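
A minimal usage sketch, not part of the original listing: the yield indicates setup_logger is written as a generator, so it can be used through contextlib.contextmanager to manage the log directory around a run. The wrapper, the exp_prefix value, and run_training below are illustrative assumptions, not code from the source repository.

from contextlib import contextmanager

# Hypothetical wrapper; in the source project setup_logger may already be decorated.
setup_logger_cm = contextmanager(setup_logger)

with setup_logger_cm(exp_prefix='my_experiment') as log_dir:
    # progress.csv is written under log_dir while this block runs;
    # the tabular output is removed again when the block finishes.
    run_training(log_dir)  # hypothetical training entry point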
Example #2
def save_exception():
    # Write the traceback of the currently-handled exception to exception.txt
    # inside the experiment's snapshot directory...
    exc_file = os.path.join(rllablogger.get_snapshot_dir(), 'exception.txt')
    with open(exc_file, 'w') as f:
        traceback.print_exc(file=f)
    # ...and also echo it to stderr for the console output.
    traceback.print_exc()
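
A minimal usage sketch, not part of the original listing: save_exception is meant to be called from an except block so the active traceback is both saved next to the experiment logs and printed to the console. The algo.train() call is a hypothetical stand-in for the actual entry point.

try:
    algo.train()  # hypothetical training entry point
except Exception:
    save_exception()
    raise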
Example #3
    def train(self):
        start_time = time.time()
        last_time = start_time

        # Evaluate untrained policy
        total_timesteps = 0
        timesteps_since_train = 0
        timesteps_since_eval = 0
        timesteps_since_reset = 0

        iteration = 0
        running_loss = None
        running_validation_loss = None

        if logger.get_snapshot_dir() and self.log_tensorboard:
            self.summary_writer = SummaryWriter(
                osp.join(logger.get_snapshot_dir(), 'tensorboard'))

        # Evaluation Code
        self.policy.eval()
        self.evaluate_policy(self.eval_episodes,
                             total_timesteps=0,
                             greedy=True,
                             prefix='Eval')
        logger.record_tabular('policy loss', 0)
        logger.record_tabular('timesteps', total_timesteps)
        logger.record_tabular('epoch time (s)', time.time() - last_time)
        logger.record_tabular('total time (s)', time.time() - start_time)
        last_time = time.time()
        logger.dump_tabular()
        # End Evaluation Code

        with tqdm.tqdm(total=self.eval_freq, smoothing=0) as ranger:
            while total_timesteps < self.max_timesteps:

                # Interact in the environment according to the exploration strategy.
                if total_timesteps < self.explore_timesteps:
                    states, actions, goal_state = self.sample_trajectory(
                        noise=1)
                else:
                    states, actions, goal_state = self.sample_trajectory(
                        greedy=True, noise=self.expl_noise)

                # With 20% probability, route this new trajectory into the validation
                # buffer instead of the replay buffer, so validation loss is computed
                # on held-out trajectories.
                if self.validation_buffer is not None and np.random.rand() < 0.2:
                    self.validation_buffer.add_trajectory(states, actions, goal_state)
                else:
                    self.replay_buffer.add_trajectory(states, actions, goal_state)

                total_timesteps += self.max_path_length
                timesteps_since_train += self.max_path_length
                timesteps_since_eval += self.max_path_length

                ranger.update(self.max_path_length)

                # Take training steps
                if timesteps_since_train >= self.train_policy_freq and total_timesteps > self.start_policy_timesteps:
                    timesteps_since_train %= self.train_policy_freq
                    self.policy.train()
                    for _ in range(
                            int(self.policy_updates_per_step *
                                self.train_policy_freq)):
                        loss = self.take_policy_step()
                        validation_loss = self.validation_loss()
                        if running_loss is None:
                            running_loss = loss
                        else:
                            running_loss = 0.9 * running_loss + 0.1 * loss

                        if running_validation_loss is None:
                            running_validation_loss = validation_loss
                        else:
                            running_validation_loss = 0.9 * running_validation_loss + 0.1 * validation_loss

                    self.policy.eval()
                    ranger.set_description(
                        'Loss: %s Validation Loss: %s' %
                        (running_loss, running_validation_loss))

                    if self.summary_writer:
                        self.summary_writer.add_scalar('Losses/Train',
                                                       running_loss,
                                                       total_timesteps)
                        self.summary_writer.add_scalar(
                            'Losses/Validation', running_validation_loss,
                            total_timesteps)

                # Evaluate, log, and save to disk
                if timesteps_since_eval >= self.eval_freq:
                    timesteps_since_eval %= self.eval_freq
                    iteration += 1
                    # Evaluation Code
                    self.policy.eval()
                    self.evaluate_policy(self.eval_episodes,
                                         total_timesteps=total_timesteps,
                                         greedy=True,
                                         prefix='Eval')
                    # running_loss is still None if no policy updates have happened yet.
                    logger.record_tabular('policy loss', running_loss or 0)
                    logger.record_tabular('timesteps', total_timesteps)
                    logger.record_tabular('epoch time (s)',
                                          time.time() - last_time)
                    logger.record_tabular('total time (s)',
                                          time.time() - start_time)
                    last_time = time.time()
                    logger.dump_tabular()

                    # Logging Code
                    if logger.get_snapshot_dir():
                        modifier = str(iteration) if self.save_every_iteration else ''
                        torch.save(
                            self.policy.state_dict(),
                            osp.join(logger.get_snapshot_dir(),
                                     'policy%s.pkl' % modifier))
                        if hasattr(self.replay_buffer, 'state_dict'):
                            with open(
                                    osp.join(logger.get_snapshot_dir(),
                                             'buffer%s.pkl' % modifier),
                                    'wb') as f:
                                pickle.dump(self.replay_buffer.state_dict(), f)

                        full_dict = dict(env=self.env, policy=self.policy)
                        with open(
                                osp.join(logger.get_snapshot_dir(),
                                         'params%s.pkl' % modifier),
                                'wb') as f:
                            pickle.dump(full_dict, f)

                    ranger.reset()
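
A small standalone sketch, not from the original source, isolating the smoothing used for the progress readouts above: each new loss is folded into an exponential moving average with factor 0.9, so the tqdm description and the TensorBoard scalars track a smoothed curve rather than the raw per-step loss.

def update_running(running, value, decay=0.9):
    # Exponential moving average; the first value passes through unchanged.
    if running is None:
        return value
    return decay * running + (1 - decay) * value

running_loss = None
for loss in [1.0, 0.8, 1.2, 0.6]:
    running_loss = update_running(running_loss, loss)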