Example #1
 def train(self):
     with tf.Session() as sess:
         sess.run(tf.initialize_all_variables())
         self.start_worker()
         start_time = time.time()
         for itr in range(self.start_itr, self.n_itr):
             itr_start_time = time.time()
             with logger.prefix('itr #%d | ' % itr):
                 logger.log("Obtaining samples...")
                 paths = self.obtain_samples(itr)
                 logger.log("Processing samples...")
                 samples_data = self.process_samples(itr, paths)
                 logger.log("Logging diagnostics...")
                 self.log_diagnostics(paths)
                 logger.log("Optimizing policy...")
                 self.optimize_policy(itr, samples_data)
                 logger.log("Saving snapshot...")
                 params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                 if self.store_paths:
                     params["paths"] = samples_data["paths"]
                 logger.save_itr_params(itr, params)
                 logger.log("Saved")
                 logger.record_tabular('Time', time.time() - start_time)
                 logger.record_tabular('ItrTime', time.time() - itr_start_time)
                 logger.dump_tabular(with_prefix=False)
                 if self.plot:
                     self.update_plot()
                     if self.pause_for_plot:
                         input("Plotting evaluation run: Press Enter to "
                               "continue...")
     self.shutdown_worker()
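
The training loops in these examples share the same tabular-logging pattern. A minimal sketch of that pattern, assuming the rllab-style logger API already used above (logger.log, logger.record_tabular, logger.dump_tabular, logger.prefix):

import time
from rllab.misc import logger

start_time = time.time()
for itr in range(3):
    itr_start_time = time.time()
    with logger.prefix('itr #%d | ' % itr):
        logger.log("Doing work...")                 # free-form status line
        logger.record_tabular('Itr', itr)           # accumulate key/value pairs for this row
        logger.record_tabular('Time', time.time() - start_time)
        logger.record_tabular('ItrTime', time.time() - itr_start_time)
        logger.dump_tabular(with_prefix=False)      # flush one table row per iteration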
Example #2
 def optimize_policy(self, itr, samples_data):
     all_input_values = tuple(ext.extract(
         samples_data,
         "observations", "actions", "advantages"
     ))
     agent_infos = samples_data["agent_infos"]
     state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
     dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
     all_input_values += tuple(state_info_list) + tuple(dist_info_list)
     if self.policy.recurrent:
         all_input_values += (samples_data["valids"],)
     logger.log("Computing loss before")
     loss_before = self.optimizer.loss(all_input_values)
     logger.log("Computing KL before")
     mean_kl_before = self.optimizer.constraint_val(all_input_values)
     logger.log("Optimizing")
     self.optimizer.optimize(all_input_values)
     logger.log("Computing KL after")
     mean_kl = self.optimizer.constraint_val(all_input_values)
     logger.log("Computing loss after")
     loss_after = self.optimizer.loss(all_input_values)
     logger.record_tabular('LossBefore', loss_before)
     logger.record_tabular('LossAfter', loss_after)
     logger.record_tabular('MeanKLBefore', mean_kl_before)
     logger.record_tabular('MeanKL', mean_kl)
     logger.record_tabular('dLoss', loss_before - loss_after)
     return dict()
Example #3
    def compute_irl(self, paths, itr=0):
        if self.no_reward:
            tot_rew = 0
            for path in paths:
                tot_rew += np.sum(path['rewards'])
                path['rewards'] *= 0
            logger.record_tabular('OriginalTaskAverageReturn', tot_rew/float(len(paths)))

        if self.irl_model_wt <= 0:
            return paths

        if self.train_irl:
            max_itrs = self.discrim_train_itrs
            lr = 1e-3
            mean_loss = self.irl_model.fit(paths, policy=self.policy, itr=itr, max_itrs=max_itrs, lr=lr,
                                           logger=logger)

            logger.record_tabular('IRLLoss', mean_loss)
            self.__irl_params = self.irl_model.get_params()

        probs = self.irl_model.eval(paths, gamma=self.discount, itr=itr)

        logger.record_tabular('IRLRewardMean', np.mean(probs))
        logger.record_tabular('IRLRewardMax', np.max(probs))
        logger.record_tabular('IRLRewardMin', np.min(probs))


        if self.irl_model.score_trajectories:
            # TODO: should I add to reward here or after advantage computation?
            for i, path in enumerate(paths):
                path['rewards'][-1] += self.irl_model_wt * probs[i]
        else:
            for i, path in enumerate(paths):
                path['rewards'] += self.irl_model_wt * probs[i]
        return paths
Example #4
    def log_diagnostics(self, paths):
        #Ntraj = len(paths)
        #acts = np.array([traj['actions'] for traj in paths])
        obs = np.array([np.sum(traj['observations'], axis=0) for traj in paths])

        state_count = np.sum(obs, axis=0)
        #state_count = np.mean(state_count, axis=0)
        state_freq = state_count/float(np.sum(state_count))
        for state in range(self.nstates):
            logger.record_tabular('AvgStateFreq%d'%state, state_freq[state])
Example #5
    def log_diagnostics(self, paths):
        n_goal = len(self.goal_positions)
        goal_reached = [False] * n_goal

        for path in paths:
            last_obs = path["observations"][-1]
            for i, goal in enumerate(self.goal_positions):
                if np.linalg.norm(last_obs - goal) < self.goal_threshold:
                    goal_reached[i] = True

        logger.record_tabular('env:goals_reached', goal_reached.count(True))
Example #6
 def fit(self, xs, ys):
     sess = tf.get_default_session()
     if self._normalize_inputs:
         # recompute normalizing constants for inputs
         sess.run([
             tf.assign(self._x_mean_var, np.mean(xs, axis=0, keepdims=True)),
             tf.assign(self._x_std_var, np.std(xs, axis=0, keepdims=True) + 1e-8),
         ])
     if self._normalize_outputs:
         # recompute normalizing constants for outputs
         sess.run([
             tf.assign(self._y_mean_var, np.mean(ys, axis=0, keepdims=True)),
             tf.assign(self._y_std_var, np.std(ys, axis=0, keepdims=True) + 1e-8),
         ])
     if self._use_trust_region:
         old_means, old_log_stds = self._f_pdists(xs)
         inputs = [xs, ys, old_means, old_log_stds]
     else:
         inputs = [xs, ys]
     loss_before = self._optimizer.loss(inputs)
     if self._name:
         prefix = self._name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     self._optimizer.optimize(inputs)
     loss_after = self._optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     if self._use_trust_region:
         logger.record_tabular(prefix + 'MeanKL', self._optimizer.constraint_val(inputs))
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
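
The fit methods in these examples recompute per-dimension normalizing constants before optimizing ("recompute normalizing constants for inputs"). A minimal NumPy sketch of that whitening step, with the 1e-8 term guarding against constant dimensions; the names here are illustrative, not the regressor's API:

import numpy as np

def input_stats(xs, eps=1e-8):
    # per-dimension mean and std over the batch axis; eps keeps std strictly positive
    mean = np.mean(xs, axis=0, keepdims=True)
    std = np.std(xs, axis=0, keepdims=True) + eps
    return mean, std

xs = np.random.randn(128, 3)
xs[:, 2] = 1.0                      # a constant input dimension
mean, std = input_stats(xs)
xs_whitened = (xs - mean) / std     # what the network sees after normalization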
Example #7
 def log_diagnostics(self, paths, prefix=''):
     progs = [
         path["observations"][-1][-4] - path["observations"][0][-4]
         for path in paths
     ]
     logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs))
     logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs))
     logger.record_tabular(prefix+'MinForwardProgress', np.min(progs))
     logger.record_tabular(prefix+'StdForwardProgress', np.std(progs))
Example #8
    def optimize_policy(self, itr, all_samples_data):
        assert len(all_samples_data) == self.num_grad_updates + 1  # we collected the rollouts to compute the grads and then the test!

        if not self.use_maml:
            all_samples_data = [all_samples_data[0]]

        input_list = []
        for step in range(len(all_samples_data)):  # these are the gradient steps
            obs_list, action_list, adv_list = [], [], []
            for i in range(self.meta_batch_size):

                inputs = ext.extract(
                    all_samples_data[step][i],
                    "observations", "actions", "advantages"
                )
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                adv_list.append(inputs[2])
            input_list += obs_list + action_list + adv_list  # per step: [obs_0 ... obs_{M-1}, act_0 ... act_{M-1}, adv_0 ... adv_{M-1}], with M = meta_batch_size

            if step == 0:  ##CF not used?
                init_inputs = input_list

        if self.use_maml:
            dist_info_list = []
            for i in range(self.meta_batch_size):
                agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
                dist_info_list += [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
            input_list += tuple(dist_info_list)
            logger.log("Computing KL before")
            mean_kl_before = self.optimizer.constraint_val(input_list)

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(input_list)
        logger.log("Optimizing")
        self.optimizer.optimize(input_list)
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(input_list)
        if self.use_maml:
            logger.log("Computing KL after")
            mean_kl = self.optimizer.constraint_val(input_list)
            logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
            logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #9
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
        if hasattr(self.algo.env, "get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except Exception:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args) != np.ndarray:
            reset_args = [reset_args]*self.n_envs
        if self.algo.policy.all_param_vals:
            cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
        else:
            cur_policy_params = [cur_policy_params]*self.n_envs
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for i in range(self.n_envs):
            paths[i] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params[i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                show_prog_bar=False,
            )
        total_time = time.time() - start
        logger.record_tabular(log_prefix+"TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())

        self.algo.policy.set_param_values(init_policy_params)

        # only whole paths are currently supported (add truncation code here if partial paths are desired)
        assert self.algo.whole_paths

        return paths
Example #10
    def train(self):
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        if self.init_pol_params is not None:
            self.policy.set_param_values(self.init_pol_params)
        if self.init_irl_params is not None:
            self.irl_model.set_params(self.init_irl_params)
        self.start_worker()
        start_time = time.time()

        returns = []
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)

                logger.log("Processing samples...")
                paths = self.compute_irl(paths, itr=itr)
                returns.append(self.log_avg_returns(paths))
                samples_data = self.process_samples(itr, paths)

                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
        return
Example #11
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     # Log anything gather-specific here, then strip the maze observations and call log_diagnostics with the stripped paths.
     # We need to log the pure gather reward!
     with logger.tabular_prefix(log_prefix + '_'):
         gather_undiscounted_returns = [sum(path['env_infos']['outer_rew']) for path in paths]
         logger.record_tabular_misc_stat('Return', gather_undiscounted_returns, placement='front')
     stripped_paths = []
     for path in paths:
         stripped_path = {}
         for k, v in path.items():
             stripped_path[k] = v
         stripped_path['observations'] = \
             stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
         #  this breaks if the obs of the robot are d>1 dimensional (not a vector)
         stripped_paths.append(stripped_path)
     with logger.tabular_prefix('wrapped_'):
         if 'env_infos' in paths[0].keys() and 'inner_rew' in paths[0]['env_infos'].keys():
             wrapped_undiscounted_return = np.mean([np.sum(path['env_infos']['inner_rew']) for path in paths])
             logger.record_tabular('AverageReturn', wrapped_undiscounted_return)
         self.wrapped_env.log_diagnostics(stripped_paths)  # see swimmer_env.py for a sketch of the maze plotting!
Example #12
 def fit(self, xs, ys):
     if self.normalize_inputs:
         # recompute normalizing constants for inputs
         new_mean = np.mean(xs, axis=0, keepdims=True)
         new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
         tf.get_default_session().run(tf.group(
             tf.assign(self.x_mean_var, new_mean),
             tf.assign(self.x_std_var, new_std),
         ))
     if self.use_trust_region and self.first_optimized:
         old_prob = self.f_prob(xs)
         inputs = [xs, ys, old_prob]
         optimizer = self.tr_optimizer
     else:
         inputs = [xs, ys]
         optimizer = self.optimizer
     loss_before = optimizer.loss(inputs)
     if self.name:
         prefix = self.name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     optimizer.optimize(inputs)
     loss_after = optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
     self.first_optimized = True
Example #13
def custom_train(algo, sess=None):
    """
    This is necessary so that we don't wipe away already initialized policy params.
    Ideally, we should submit this as a pull-request option to rllab and remove it from here once that is done.
    """
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()

    rollout_cache = []
    initialize_uninitialized(sess)
    algo.start_worker()
    start_time = time.time()
    for itr in range(algo.start_itr, algo.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = algo.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = algo.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            algo.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            algo.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = algo.get_itr_snapshot(itr, samples_data)  # , **kwargs)
            if algo.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)

    algo.shutdown_worker()
    if created_session:
        sess.close()
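
The initialize_uninitialized helper called above is not shown in this snippet; per the docstring, its job is to initialize only the variables that are not yet initialized, so already-loaded policy parameters are left intact. A plausible TF1 sketch of such a helper (an assumption, not the project's actual implementation; Example #22 below inlines a try/except variant of the same idea):

import tensorflow as tf

def initialize_uninitialized(sess):
    # initialize only variables the session has not seen yet, leaving
    # pre-loaded / already-initialized parameters untouched
    global_vars = tf.global_variables()
    init_flags = sess.run([tf.is_variable_initialized(v) for v in global_vars])
    uninit_vars = [v for v, ok in zip(global_vars, init_flags) if not ok]
    if uninit_vars:
        sess.run(tf.variables_initializer(uninit_vars))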
Example #14
 def train(self, sess=None):
     created_session = True if (sess is None) else False
     if sess is None:
         sess = tf.Session()
         sess.__enter__()
         
     sess.run(tf.global_variables_initializer())
     self.start_worker()
     start_time = time.time()
     for itr in range(self.start_itr, self.n_itr):
         itr_start_time = time.time()
         with logger.prefix('itr #%d | ' % itr):
             logger.log("Obtaining samples...")
             paths = self.obtain_samples(itr)
             logger.log("Processing samples...")
             samples_data = self.process_samples(itr, paths)
             logger.log("Logging diagnostics...")
             self.log_diagnostics(paths)
             logger.log("Optimizing policy...")
             self.optimize_policy(itr, samples_data)
             logger.log("Saving snapshot...")
             params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
             if self.store_paths:
                 params["paths"] = samples_data["paths"]
             logger.save_itr_params(itr, params)
             logger.log("Saved")
             logger.record_tabular('Time', time.time() - start_time)
             logger.record_tabular('ItrTime', time.time() - itr_start_time)
             logger.dump_tabular(with_prefix=False)
             if self.plot:
                 rollout(self.env, self.policy, animated=True, max_path_length=self.max_path_length)
                 if self.pause_for_plot:
                     input("Plotting evaluation run: Press Enter to "
                           "continue...")
     self.shutdown_worker()
     if created_session:
         sess.close()
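
Several of these train methods accept an optional sess argument and only create (and later close) a session when the caller did not supply one. A small sketch of that ownership pattern, factored into a helper purely for illustration (the helper name is an assumption, not part of the original code):

import tensorflow as tf

def ensure_default_session(sess=None):
    # returns (session, created): the caller should close the session only if it
    # was created here, so externally managed sessions are left untouched
    created = sess is None
    if created:
        sess = tf.Session()
        sess.__enter__()   # make it the default session, as the snippets above do
    return sess, created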
Example #15
    def fit(self, xs, ys):

        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(0, num_samples_tot, int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self._x_mean_var.set_value(
                np.mean(xs, axis=0, keepdims=True).astype(theano.config.floatX))
            self._x_std_var.set_value(
                (np.std(xs, axis=0, keepdims=True) + 1e-8).astype(theano.config.floatX))
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            self._y_mean_var.set_value(
                np.mean(ys, axis=0, keepdims=True).astype(theano.config.floatX))
            self._y_std_var.set_value(
                (np.std(ys, axis=0, keepdims=True) + 1e-8).astype(theano.config.floatX))
        if self._name:
            prefix = self._name + "_"
        else:
            prefix = ""
        # FIXME: needs batch computation to avoid OOM.
        loss_before, loss_after, mean_kl, batch_count = 0., 0., 0., 0
        for batch in iterate_minibatches_generic(input_lst=[xs, ys], batchsize=self._batchsize, shuffle=True):
            batch_count += 1
            xs, ys = batch
            if self._use_trust_region:
                old_means, old_log_stds = self._f_pdists(xs)
                inputs = [xs, ys, old_means, old_log_stds]
            else:
                inputs = [xs, ys]
            loss_before += self._optimizer.loss(inputs)

            self._optimizer.optimize(inputs)
            loss_after += self._optimizer.loss(inputs)
            if self._use_trust_region:
                mean_kl += self._optimizer.constraint_val(inputs)

        logger.record_tabular(prefix + 'LossBefore', loss_before / batch_count)
        logger.record_tabular(prefix + 'LossAfter', loss_after / batch_count)
        logger.record_tabular(prefix + 'dLoss', (loss_before - loss_after) / batch_count)
        if self._use_trust_region:
            logger.record_tabular(prefix + 'MeanKL', mean_kl / batch_count)
Example #16
File: vpg_expl.py  Project: jpdoyle/vime
    def optimize_policy(self, itr, samples_data):
        logger.log("optimizing policy")
        inputs = ext.extract(
            samples_data,
            "observations", "actions", "advantages"
        )
        if self.policy.recurrent:
            inputs += (samples_data["valids"],)
        agent_infos = samples_data["agent_infos"]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        loss_before = self.optimizer.loss(inputs)
        self.optimizer.optimize(inputs)
        loss_after = self.optimizer.loss(inputs)
        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)

        mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
Example #17
    def log_diagnostics(self, paths, prefix=''):
        progs = [
            path["observations"][-1][-3] - path["observations"][0][-3]
            for path in paths
        ]
        #if np.mean(progs) > 4.5:
        #    import pdb; pdb.set_trace()
        #path = paths[0]
        #t = -10
        #lb, ub = self.action_bounds
        #scaling = (ub - lb) * 0.5
        #rew = path['rewards'][t]
        #act = path['actions'][t]
        #ctrl_cost = 0.5*self.ctrl_cost_coeff*np.sum(np.square(act/scaling))

        logger.record_tabular('AverageForwardProgress', np.mean(progs))
        logger.record_tabular('MaxForwardProgress', np.max(progs))
        logger.record_tabular('MinForwardProgress', np.min(progs))
        logger.record_tabular('StdForwardProgress', np.std(progs))
Example #18
 def fit(self, xs, ys):
     if self._normalize_inputs:
         # recompute normalizing constants for inputs
         self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
         self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)
     if self._use_trust_region:
         old_prob = self._f_prob(xs)
         inputs = [xs, ys, old_prob]
     else:
         inputs = [xs, ys]
     loss_before = self._optimizer.loss(inputs)
     if self._name:
         prefix = self._name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     self._optimizer.optimize(inputs)
     loss_after = self._optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
Example #19
    def log_diagnostics(self, batch):
        """Record diagnostic information.

        Records the mean and standard deviation of the Q-function values
        and the squared Bellman residual (mean squared Bellman error)
        for a sample batch.

        Also calls the `draw` method of the plotter, if a plotter is defined.
        """

        feeds = self._get_feed_dict(batch)
        qf, bellman_residual = self._sess.run(
            [self._q_values, self._bellman_residual], feeds)

        logger.record_tabular('qf-avg', np.mean(qf))
        logger.record_tabular('qf-std', np.std(qf))
        logger.record_tabular('mean-sq-bellman-error', bellman_residual)

        self.policy.log_diagnostics(batch)
        if self.plotter:
            self.plotter.draw()
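
The docstring above refers to the squared Bellman residual (mean squared Bellman error). For reference, a generic NumPy sketch of that quantity; the argument names and shapes are assumptions for illustration and are not taken from this snippet's graph:

import numpy as np

def mean_squared_bellman_error(q_values, rewards, next_values, terminals, discount):
    # one-step TD targets; terminal transitions do not bootstrap
    targets = rewards + discount * (1.0 - terminals) * next_values
    return np.mean(np.square(q_values - targets))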
Example #20
 def log_diagnostics(self, paths):
     log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
     logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))
Example #21
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # create Alice

    env_alice = AliceEnv(env_alice=env,
                         env_bob=env,
                         policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'],
                         gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    if v["baseline"] == "MLP":
        baseline_alice = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline_alice = LinearFeatureBaseline(env_spec=env.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # load the state collection from data_upload

    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([
            0.55,
            1,
            0,
            0,
            0,
            0,
            1,
            0,
            -1,
            0,
            -1,
            0,
            1,
        ])
    init_pos = np.array(init_pos)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir)

        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # Following code should be indented
        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log("Labeling the starts")
        [starts, labels] = label_states_from_paths(
            trpo_paths,
            n_traj=v['n_traj'],
            key='goal_reached',  # using the min n_traj
            as_goal=False,
            env=env)
        # labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if len(filtered_raw_starts) == 0:
            # add a ton of noise if all the states I had ended up being high reward!
            logger.log("Bad Alice!  All goals are high reward!")

        all_starts.append(filtered_raw_starts)

        # Useful plotting and metrics (basic test set)
        # need to put this last! otherwise the labels variable gets overwritten below
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=v['n_traj'],
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos,
                                                 env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=5,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos,
                                labels,
                                report=report,
                                itr=outer_iter,
                                limit=v['goal_range'],
                                center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
Example #22
    def train(self):
        with tf.Session() as sess:
            if self.load_policy is not None:
                import joblib
                self.policy = joblib.load(self.load_policy)['policy']
            self.init_opt()
            # initialize uninitialized vars (I know, it's ugly)
            uninit_vars = []
            for var in tf.all_variables():
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninit_vars.append(var)
            sess.run(tf.initialize_variables(uninit_vars))
            #sess.run(tf.initialize_all_variables())
            self.start_worker()
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):

                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr)
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data)
                    #new_param_values = self.policy.get_variable_values(self.policy.all_params)

                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr,
                                                   samples_data)  # , **kwargs)
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)

                    #import pickle
                    #with open('paths_itr'+str(itr)+'.pkl', 'wb') as f:
                    #    pickle.dump(paths, f)

                    # debugging
                    """
                    if itr % 1 == 0:
                        logger.log("Saving visualization of paths")
                        import matplotlib.pyplot as plt;
                        for ind in range(5):
                            plt.clf(); plt.hold(True)
                            points = paths[ind]['observations']
                            plt.plot(points[:,0], points[:,1], '-r', linewidth=2)
                            plt.xlim([-1.0, 1.0])
                            plt.ylim([-1.0, 1.0])
                            plt.legend(['path'])
                            plt.savefig('/home/cfinn/path'+str(ind)+'.png')
                    """
                    # end debugging

                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        self.update_plot()
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")
        self.shutdown_worker()
Example #23
    def train(self):
        # TODO - make this a util
        flatten_list = lambda l: [item for sublist in l for item in sublist]

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = self.frac_gpu

        with tf.Session(config=config) as sess:
            # Code for loading a previous policy. Somewhat hacky because needs to be in sess.
            if self.load_policy is not None:
                import joblib
                self.policy = joblib.load(self.load_policy)['policy']
            self.init_opt()
            self.initialize_uninitialized_variables(sess)

            self.all_paths = []

            self.start_worker()
            start_time = time.time()
            n_env_timesteps = 0

            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):

                    logger.record_tabular("mean_inner_stepsize",
                                          self.policy.get_mean_step_size())
                    ''' sample environment configuration '''
                    env = self.env
                    while not ('sample_env_params' in dir(env)
                               or 'sample_goals' in dir(env)):
                        env = env._wrapped_env
                    if 'sample_goals' in dir(env):
                        learner_env_params = env.sample_goals(
                            self.meta_batch_size)
                    elif 'sample_env_params' in dir(env):
                        learner_env_params = env.sample_env_params(
                            self.meta_batch_size)
                    ''' get rollouts from the environment'''

                    time_env_sampling_start = time.time()

                    if self.initial_random_samples and itr == 0:
                        logger.log(
                            "Obtaining random samples from the environment...")
                        new_env_paths = self.obtain_random_samples(itr,
                                                                   log=True)

                        n_env_timesteps += self.initial_random_samples
                        logger.record_tabular("n_timesteps", n_env_timesteps)

                        self.all_paths.extend(new_env_paths)
                        samples_data_dynamics = self.random_sampler.process_samples(
                            itr,
                            self.all_paths,
                            log=True,
                            log_prefix='EnvTrajs-'
                        )  # must log in the same way as the model sampler below

                    else:
                        if self.reset_policy_std:
                            logger.log("Resetting policy std")
                            self.policy.set_std()

                        if not self.tailored_exploration:
                            logger.log(
                                "Disabling tailored exploration. Using pre-update policy to collect samples."
                            )
                            self.policy.switch_to_init_dist()

                        logger.log(
                            "Obtaining samples from the environment using the policy..."
                        )
                        new_env_paths = self.obtain_env_samples(
                            itr,
                            reset_args=learner_env_params,
                            log_prefix='EnvSampler-')
                        n_env_timesteps += self.batch_size
                        logger.record_tabular("n_timesteps", n_env_timesteps)

                        # flatten dict of paths per task/mode --> list of paths
                        new_env_paths = [
                            path for task_paths in new_env_paths.values()
                            for path in task_paths
                        ]
                        # self.all_paths.extend(new_env_paths)
                        logger.log("Processing environment samples...")
                        # first processing just for logging purposes
                        self.model_sampler.process_samples(
                            itr,
                            new_env_paths,
                            log=True,
                            log_prefix='EnvTrajs-')

                        new_samples_data_dynamics = self.process_samples_for_dynamics(
                            itr, new_env_paths)
                        for k, v in samples_data_dynamics.items():
                            samples_data_dynamics[k] = np.concatenate(
                                [v, new_samples_data_dynamics[k]],
                                axis=0)[-int(self.dynamics_data_buffer_size):]

                    logger.record_tabular(
                        'Time-EnvSampling',
                        time.time() - time_env_sampling_start)

                    if self.log_real_performance:
                        logger.log(
                            "Evaluating the performance of the real policy")
                        self.policy.switch_to_init_dist()
                        new_env_paths = self.obtain_env_samples(
                            itr,
                            reset_args=learner_env_params,
                            log_prefix='PrePolicy-')
                        samples_data = {}
                        for key in new_env_paths.keys():
                            samples_data[
                                key] = self.process_samples_for_policy(
                                    itr, new_env_paths[key], log=False)
                        _ = self.process_samples_for_policy(
                            itr,
                            flatten_list(new_env_paths.values()),
                            log_prefix='PrePolicy-')
                        self.policy.compute_updated_dists(samples_data)
                        new_env_paths = self.obtain_env_samples(
                            itr,
                            reset_args=learner_env_params,
                            log_prefix='PostPolicy-',
                        )
                        _ = self.process_samples_for_policy(
                            itr,
                            flatten_list(new_env_paths.values()),
                            log_prefix='PostPolicy-')
                    ''' --------------- fit dynamics model --------------- '''

                    time_fit_start = time.time()

                    epochs = self.dynamic_model_epochs[min(
                        itr,
                        len(self.dynamic_model_epochs) - 1)]
                    if self.reinit_model and itr % self.reinit_model == 0:
                        self.dynamics_model.reinit_model()
                        epochs = self.dynamic_model_epochs[0]
                    logger.log("Training dynamics model for %i epochs ..." %
                               (epochs))
                    self.dynamics_model.fit(
                        samples_data_dynamics['observations_dynamics'],
                        samples_data_dynamics['actions_dynamics'],
                        samples_data_dynamics['next_observations_dynamics'],
                        epochs=epochs,
                        verbose=True,
                        log_tabular=True)

                    logger.record_tabular('Time-ModelFit',
                                          time.time() - time_fit_start)
                    ''' --------------- MAML steps --------------- '''

                    times_dyn_sampling = []
                    times_dyn_sample_processing = []
                    times_inner_step = []
                    times_outer_step = []

                    time_maml_steps_start = time.time()

                    for maml_itr in range(self.num_maml_steps_per_iter):

                        self.policy.switch_to_init_dist(
                        )  # Switch to pre-update policy

                        all_samples_data_maml_iter, all_paths_maml_iter = [], []
                        for step in range(self.num_grad_updates + 1):
                            ''' --------------- Sampling from Dynamics Models --------------- '''

                            logger.log(
                                "MAML Step %i%s of %i - Obtaining samples from the dynamics model..."
                                % (maml_itr + 1, chr(97 + step),
                                   self.num_maml_steps_per_iter))

                            time_dyn_sampling_start = time.time()

                            if self.reset_from_env_traj:
                                new_model_paths = self.obtain_model_samples(
                                    itr,
                                    traj_starting_obs=samples_data_dynamics[
                                        'observations_dynamics'],
                                    traj_starting_ts=samples_data_dynamics[
                                        'timesteps_dynamics'])
                            else:
                                new_model_paths = self.obtain_model_samples(
                                    itr)

                            assert type(new_model_paths) == dict and len(
                                new_model_paths) == self.meta_batch_size
                            all_paths_maml_iter.append(new_model_paths)

                            times_dyn_sampling.append(time.time() -
                                                      time_dyn_sampling_start)
                            ''' --------------- Processing Dynamics Samples --------------- '''

                            logger.log("Processing samples...")
                            time_dyn_sample_processing_start = time.time()
                            samples_data = {}

                            for key in new_model_paths.keys(
                            ):  # the keys are the tasks
                                # don't log because this will spam the console with every task.
                                samples_data[
                                    key] = self.process_samples_for_policy(
                                        itr, new_model_paths[key], log=False)
                            all_samples_data_maml_iter.append(samples_data)

                            # for logging purposes
                            _, mean_reward = self.process_samples_for_policy(
                                itr,
                                flatten_list(new_model_paths.values()),
                                log='reward',
                                log_prefix="DynTrajs%i%s-" %
                                (maml_itr + 1, chr(97 + step)),
                                return_reward=True)

                            times_dyn_sample_processing.append(
                                time.time() - time_dyn_sample_processing_start)
                            ''' --------------- Inner Policy Update --------------- '''

                            time_inner_step_start = time.time()

                            if step < self.num_grad_updates:
                                logger.log("Computing policy updates...")
                                self.policy.compute_updated_dists(samples_data)

                            times_inner_step.append(time.time() -
                                                    time_inner_step_start)

                        if maml_itr == 0:
                            prev_rolling_reward_mean = mean_reward
                            rolling_reward_mean = mean_reward
                        else:
                            prev_rolling_reward_mean = rolling_reward_mean
                            rolling_reward_mean = 0.8 * rolling_reward_mean + 0.2 * mean_reward

                        # stop gradient steps when mean_reward decreases
                        if self.retrain_model_when_reward_decreases and rolling_reward_mean < prev_rolling_reward_mean:
                            logger.log(
                                "Stopping policy gradients steps since rolling mean reward decreased from %.2f to %.2f"
                                % (prev_rolling_reward_mean,
                                   rolling_reward_mean))
                            # complete some logging stuff
                            for i in range(maml_itr + 1,
                                           self.num_maml_steps_per_iter):
                                logger.record_tabular(
                                    'DynTrajs%ia-AverageReturn' % (i + 1), 0.0)
                                logger.record_tabular(
                                    'DynTrajs%ib-AverageReturn' % (i + 1), 0.0)
                            break
                        ''' --------------- Meta Policy Update --------------- '''

                        logger.log(
                            "MAML Step %i of %i - Optimizing policy..." %
                            (maml_itr + 1, self.num_maml_steps_per_iter))
                        time_outer_step_start = time.time()

                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        self.optimize_policy(itr,
                                             all_samples_data_maml_iter,
                                             log=False)
                        if itr == 0: sess.graph.finalize()

                        times_outer_step.append(time.time() -
                                                time_outer_step_start)
                    ''' --------------- Logging Stuff --------------- '''

                    logger.record_tabular('Time-MAMLSteps',
                                          time.time() - time_maml_steps_start)
                    logger.record_tabular('Time-DynSampling',
                                          np.mean(times_dyn_sampling))
                    logger.record_tabular('Time-DynSampleProc',
                                          np.mean(times_dyn_sample_processing))
                    logger.record_tabular('Time-InnerStep',
                                          np.mean(times_inner_step))
                    logger.record_tabular('Time-OuterStep',
                                          np.mean(times_outer_step))

                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(
                        itr, all_samples_data_maml_iter[-1])  # , **kwargs)
                    if self.store_paths:
                        params["paths"] = all_samples_data_maml_iter[-1][
                            "paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time-Overall',
                                          time.time() - start_time)
                    logger.record_tabular('Time-Itr',
                                          time.time() - itr_start_time)

                    logger.dump_tabular(with_prefix=False)

            self.shutdown_worker()
Example #24
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path["rewards"], self.discount) for path in paths]
        )

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(np.square(np.concatenate(
            [path["actions"] for path in paths]
        )))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True)
        )
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True)
        )

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn',
                              np.mean(returns))
        logger.record_tabular('StdReturn',
                              np.std(returns))
        logger.record_tabular('MaxReturn',
                              np.max(returns))
        logger.record_tabular('MinReturn',
                              np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm',
                              policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm',
                              qfun_reg_param_norm)

        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []
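
special.discount_return, used above for the average discounted return, computes the discount-weighted sum of a reward sequence. A short NumPy equivalent, shown only to illustrate what is being averaged (the rllab implementation may differ in detail):

import numpy as np

def discount_return(rewards, discount):
    # sum_t discount**t * r_t for a single path
    rewards = np.asarray(rewards, dtype=float)
    return float(np.sum(rewards * discount ** np.arange(len(rewards))))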
Example #25
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        if not self.transfer:
            sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            if itr == 0 or itr % self.policy_save_interval == 0:
                params = dict(
                    params1=self.policy.get_param_values(),
                    params2=self.policy2.get_param_values(),
                )
                joblib.dump(params,
                            self.policy_path + '/params' + str(itr) + '.pkl',
                            compress=3)

            itr_start_time = time.time()

            for n1 in range(self.N1):
                with logger.prefix('itr #%d ' % itr + 'n1 #%d |' % n1):
                    logger.log("training policy 1...")
                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr, 1)
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths, 1)

                    if self.record_rewards:
                        undiscounted_returns = [
                            sum(path["rewards"]) for path in paths
                        ]
                        average_discounted_return = np.mean(
                            [path["returns"][0] for path in paths])
                        AverageReturn = np.mean(undiscounted_returns)
                        StdReturn = np.std(undiscounted_returns)
                        MaxReturn = np.max(undiscounted_returns)
                        MinReturn = np.min(undiscounted_returns)
                        self.rewards['average_discounted_return1'].append(
                            average_discounted_return)
                        self.rewards['AverageReturn1'].append(AverageReturn)
                        self.rewards['StdReturn1'].append(StdReturn)
                        self.rewards['MaxReturn1'].append(MaxReturn)
                        self.rewards['MinReturn1'].append(MinReturn)

                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths, 1)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data, 1)

                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)

            for n2 in range(self.N2):
                if itr != self.n_itr - 1:  # don't train the adversary on the last iteration
                    with logger.prefix('itr #%d ' % itr + 'n2 #%d |' % n2):
                        logger.log("training policy 2...")
                        logger.log("Obtaining samples...")
                        paths = self.obtain_samples(itr, 2)
                        logger.log("Processing samples...")
                        samples_data = self.process_samples(itr, paths, 2)

                        if self.record_rewards:
                            undiscounted_returns = [
                                sum(path["rewards"]) for path in paths
                            ]
                            average_discounted_return = np.mean(
                                [path["returns"][0] for path in paths])
                            AverageReturn = np.mean(undiscounted_returns)
                            StdReturn = np.std(undiscounted_returns)
                            MaxReturn = np.max(undiscounted_returns)
                            MinReturn = np.min(undiscounted_returns)
                            self.rewards['average_discounted_return2'].append(
                                average_discounted_return)
                            self.rewards['AverageReturn2'].append(
                                AverageReturn)
                            self.rewards['StdReturn2'].append(StdReturn)
                            self.rewards['MaxReturn2'].append(MaxReturn)
                            self.rewards['MinReturn2'].append(MinReturn)

                        logger.log("Logging diagnostics...")
                        self.log_diagnostics(paths, 2)
                        logger.log("Optimizing policy...")
                        self.optimize_policy(itr, samples_data, 2)

                        logger.record_tabular('Time', time.time() - start_time)
                        logger.record_tabular('ItrTime',
                                              time.time() - itr_start_time)
                        logger.dump_tabular(with_prefix=False)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)  # , **kwargs)
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            # logger.record_tabular('Time', time.time() - start_time)
            # logger.record_tabular('ItrTime', time.time() - itr_start_time)
            # logger.dump_tabular(with_prefix=False)

        self.shutdown_worker()
        if created_session:
            sess.close()
Example #26
    def _evaluate(self, epoch):
        """Perform evaluation for the current policy.

        :param epoch: The epoch number.
        :return: None
        """

        if self._eval_n_episodes < 1:
            return

        paths = rollouts(self._eval_env, self.policy, self._max_path_length,
                         self._eval_n_episodes)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))
        logger.record_tabular('epoch', epoch)

        self._eval_env.log_diagnostics(paths)
        if self._eval_render:
            self._eval_env.render(paths)

        batch = self.pool.random_batch(self._batch_size)
        self.log_diagnostics(batch)
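The record_tabular / dump_tabular pairing used throughout these examples buffers key-value pairs and then flushes them as one row per epoch. A minimal sketch of that interface (a stand-in for illustration only, not rllab's logger implementation):

class MiniTabularLogger:
    def __init__(self):
        self._row = {}

    def record_tabular(self, key, value):
        # Buffer one key/value pair for the current row.
        self._row[key] = value

    def dump_tabular(self):
        # Flush the buffered row and reset for the next epoch.
        for key, value in sorted(self._row.items()):
            print('%-24s %s' % (key, value))
        self._row = {}

log = MiniTabularLogger()
log.record_tabular('return-average', 12.3)
log.record_tabular('epoch', 0)
log.dump_tabular()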
Example #27
File: irl.py  Project: pidchen/atari-irl
    def fit(self,
            paths,
            policy=None,
            batch_size=256,
            logger=None,
            lr=1e-3,
            itr=0,
            **kwargs):
        if isinstance(self.expert_trajs[0], dict):
            print("Warning: Processing state out of dictionary")
            self._insert_next_state(self.expert_trajs)
            expert_obs_base, expert_obs_next_base, expert_acts, expert_acts_next = \
                self.extract_paths(self.expert_trajs, keys=(
                    'observations', 'observations_next',
                    'actions', 'actions_next'
                ))
        else:
            expert_obs_base, expert_obs_next_base, expert_acts, expert_acts_next, _ = \
                self.expert_trajs

        #expert_probs = paths.sampler.get_a_logprobs(
        obs, obs_next, acts, acts_next, path_probs = paths.extract_paths(
            ('observations', 'observations_next', 'actions', 'actions_next',
             'a_logprobs'),
            obs_modifier=self.modify_obs)

        expert_obs = expert_obs_base
        expert_obs_next = expert_obs_next_base

        raw_discrim_scores = []
        # Train discriminator
        for it in TrainingIterator(self.max_itrs, heartbeat=5):
            nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \
                self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size)

            nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch = \
                self.sample_batch(
                    expert_obs_next,
                    expert_obs,
                    expert_acts_next,
                    expert_acts,
                    # expert_probs,
                    batch_size=batch_size
                )
            expert_lprobs_batch = paths.sampler.get_a_logprobs(
                expert_obs_batch, expert_act_batch)

            expert_obs_batch = self.modify_obs(expert_obs_batch)
            nexpert_obs_batch = self.modify_obs(nexpert_obs_batch)
            if self.encoder:
                expert_obs_batch = self.encode_fn(
                    expert_obs_batch, expert_act_batch.argmax(axis=1))
                nexpert_obs_batch = self.encode_fn(
                    nexpert_obs_batch, nexpert_act_batch.argmax(axis=1))

            # Build feed dict
            labels = np.zeros((batch_size * 2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch],
                                        axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
            nact_batch = np.concatenate([nact_batch, nexpert_act_batch],
                                        axis=0)
            lprobs_batch = np.expand_dims(np.concatenate(
                [lprobs_batch, expert_lprobs_batch], axis=0),
                                          axis=1).astype(np.float32)

            feed_dict = {
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.nobs_t: nobs_batch,
                self.nact_t: nact_batch,
                self.labels: labels,
                self.lprobs: lprobs_batch,
                self.lr: lr
            }

            loss, _, acc, scores = tf.get_default_session().run(
                [
                    self.loss, self.step, self.update_accuracy,
                    self.discrim_output
                ],
                feed_dict=feed_dict)
            # we only want the average score for the non-expert demos
            non_expert_slice = slice(0, batch_size)
            score, raw_score = self._process_discrim_output(
                scores[non_expert_slice])
            assert len(score) == batch_size
            assert np.sum(labels[non_expert_slice]) == 0
            raw_discrim_scores.append(raw_score)

            it.record('loss', loss)
            it.record('accuracy', acc)
            it.record('avg_score', np.mean(score))
            if it.heartbeat:
                print(it.itr_message())
                mean_loss = it.pop_mean('loss')
                print('\tLoss:%f' % mean_loss)
                mean_acc = it.pop_mean('accuracy')
                print('\tAccuracy:%f' % mean_acc)
                mean_score = it.pop_mean('avg_score')

        if logger:
            logger.record_tabular('GCLDiscrimLoss', mean_loss)
            logger.record_tabular('GCLDiscrimAccuracy', mean_acc)
            logger.record_tabular('GCLMeanScore', mean_score)

        # set the center for our normal distribution
        scores = np.hstack(raw_discrim_scores)
        self.score_std = np.std(scores)
        self.score_mean = np.mean(scores)

        return mean_loss
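The feed dict above stacks policy samples (label 0) on top of expert samples (label 1) before each discriminator step. A minimal NumPy sketch of the binary cross-entropy objective implied by that label convention (the real model also folds in the log-probabilities, as in AIRL; the helper name below is illustrative only):

import numpy as np

def discriminator_bce(policy_scores, expert_scores):
    # Policy samples are labeled 0, expert samples 1, matching the
    # label construction above.
    scores = np.concatenate([policy_scores, expert_scores])
    labels = np.concatenate([np.zeros_like(policy_scores),
                             np.ones_like(expert_scores)])
    eps = 1e-8
    return float(-np.mean(labels * np.log(scores + eps) +
                          (1.0 - labels) * np.log(1.0 - scores + eps)))

print(discriminator_bce(np.array([0.2, 0.4]), np.array([0.9, 0.7])))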
Example #28
    def train(self):
        with tf.Session() as sess:
            if self.load_policy is not None:
                import joblib
                self.policy = joblib.load(self.load_policy)['policy']
            self.init_opt()
            # initialize uninitialized vars (I know, it's ugly)
            uninit_vars = []
            for var in tf.all_variables():
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninit_vars.append(var)
            sess.run(tf.initialize_variables(uninit_vars))
            #sess.run(tf.initialize_all_variables())
            self.start_worker()
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):

                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr)
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data)
                    #new_param_values = self.policy.get_variable_values(self.policy.all_params)

                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime', time.time() - itr_start_time)

                    #import pickle
                    #with open('paths_itr'+str(itr)+'.pkl', 'wb') as f:
                    #    pickle.dump(paths, f)

                    # debugging
                    """
                    if itr % 1 == 0:
                        logger.log("Saving visualization of paths")
                        import matplotlib.pyplot as plt;
                        for ind in range(5):
                            plt.clf(); plt.hold(True)
                            points = paths[ind]['observations']
                            plt.plot(points[:,0], points[:,1], '-r', linewidth=2)
                            plt.xlim([-1.0, 1.0])
                            plt.ylim([-1.0, 1.0])
                            plt.legend(['path'])
                            plt.savefig('/home/cfinn/path'+str(ind)+'.png')
                    """
                    # end debugging

                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        self.update_plot()
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")
        self.shutdown_worker()
Example #29
File: gmm.py  Project: tomasruizt/sac
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        feeds = {self._observations_ph: batch['observations']}
        sess = tf_utils.get_default_session()
        mus, log_sigs, log_ws, log_pis = sess.run(
            (
                self.distribution.mus_t,
                self.distribution.log_sigs_t,
                self.distribution.log_ws_t,
                self.distribution.log_p_t,
            ),
            feeds
        )

        logger.record_tabular('gmm-mus-mean', np.mean(mus))
        logger.record_tabular('gmm-mus-min', np.min(mus))
        logger.record_tabular('gmm-mus-max', np.max(mus))
        logger.record_tabular('gmm-mus-std', np.std(mus))
        logger.record_tabular('gmm-log-w-mean', np.mean(log_ws))
        logger.record_tabular('gmm-log-w-min', np.min(log_ws))
        logger.record_tabular('gmm-log-w-max', np.max(log_ws))
        logger.record_tabular('gmm-log-w-std', np.std(log_ws))
        logger.record_tabular('gmm-log-sigs-mean', np.mean(log_sigs))
        logger.record_tabular('gmm-log-sigs-min', np.min(log_sigs))
        logger.record_tabular('gmm-log-sigs-max', np.max(log_sigs))
        logger.record_tabular('gmm-log-sigs-std', np.std(log_sigs))
        logger.record_tabular('log_pi_mean', np.mean(log_pis))
        logger.record_tabular('log_pi_max', np.max(log_pis))
        logger.record_tabular('log_pi_min', np.min(log_pis))
Example #30
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                         bounds=v['goal_range'])
    logger.log('Saving to report')
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1,
                         stop_threshold=v['stop_threshold'], start_generation=False)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)
    if v['baseline'] == 'g_mlp':
        baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        raw_goals, t_alices = generate_states_alice(env_alice=env_alice, algo_alice=algo_alice,
                                                    num_new_states=v['num_new_goals'], log_dir=log_dir,
                                                    start_generation=False)

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            all_paths = algo.train()

        [goals, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached')

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'],
                             bounds=v['goal_range'])

        # logger.log("Labeling the goals")
        # labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append the new goals to the replay buffer of all goals, skipping the low-reward ones
        filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                    horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
Example #31
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session()
            sess.__enter__()
        if not self.transfer:
            sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                # self.env._wrapped_env.generate_grid=True
                # self.env._wrapped_env.generate_b0_start_goal=True
                # self.env.reset()
                # self.env._wrapped_env.generate_grid=False
                # self.env._wrapped_env.generate_b0_start_goal=False
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)

                if self.record_rewards:
                    logger.log("recording rewards...")
                    undiscounted_returns = [
                        sum(path["rewards"]) for path in paths
                    ]
                    average_discounted_return = np.mean(
                        [path["returns"][0] for path in paths])
                    AverageReturn = np.mean(undiscounted_returns)
                    StdReturn = np.std(undiscounted_returns)
                    MaxReturn = np.max(undiscounted_returns)
                    MinReturn = np.min(undiscounted_returns)
                    self.rewards['average_discounted_return'].append(
                        average_discounted_return)
                    self.rewards['AverageReturn'].append(AverageReturn)
                    self.rewards['StdReturn'].append(StdReturn)
                    self.rewards['MaxReturn'].append(MaxReturn)
                    self.rewards['MinReturn'].append(MinReturn)
                    print("AverageReturn: ", AverageReturn)
                    print("MaxReturn: ", MaxReturn)
                    print("MinReturn: ", MinReturn)
                    # print("returns: ",samples_data["returns"])
                    # print("valids: ",samples_data["valids"])

                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
        if created_session:
            sess.close()
Example #32
 def log_diagnostics(self, paths):
     if len(paths) > 0:
         progs = [
             path["observations"][-1][-3] - path["observations"][0][-3]
             for path in paths
         ]
         logger.record_tabular('AverageForwardProgress', np.mean(progs))
         logger.record_tabular('MaxForwardProgress', np.max(progs))
         logger.record_tabular('MinForwardProgress', np.min(progs))
         logger.record_tabular('StdForwardProgress', np.std(progs))
     else:
         logger.record_tabular('AverageForwardProgress', np.nan)
         logger.record_tabular('MaxForwardProgress', np.nan)
         logger.record_tabular('MinForwardProgress', np.nan)
         logger.record_tabular('StdForwardProgress', np.nan)
Example #33
    def evaluate(self, epoch, memory):

        if epoch == self.n_epochs - 1:
            logger.log("Collecting samples for evaluation")
            rewards = sample_rewards(env=self.env,
                                     policy=self.policy,
                                     eval_samples=self.eval_samples,
                                     max_path_length=self.max_path_length)
            average_discounted_return = np.mean(
                [discount_return(reward, self.discount) for reward in rewards])
            returns = [sum(reward) for reward in rewards]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_qfunc_loss = np.mean(self.qfunc_loss_averages)
        average_policy_loss = np.mean(self.policy_loss_averages)

        logger.record_tabular('Epoch', epoch)
        if epoch == self.n_epochs - 1:
            logger.record_tabular('AverageReturn', np.mean(returns))
            logger.record_tabular('StdReturn', np.std(returns))
            logger.record_tabular('MaxReturn', np.max(returns))
            logger.record_tabular('MinReturn', np.min(returns))
            logger.record_tabular('AverageDiscountedReturn',
                                  average_discounted_return)
        if len(self.strategy_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.strategy_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.strategy_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.strategy_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.strategy_path_returns))
        logger.record_tabular('AverageQLoss', average_qfunc_loss)
        logger.record_tabular('AveragePolicyLoss', average_policy_loss)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))

        self.qfunc_loss_averages = []
        self.policy_loss_averages = []
        self.q_averages = []
        self.y_averages = []
        self.strategy_path_returns = []
Example #34
File: cem.py  Project: ermongroup/MetaIRL
    def train(self):
        parallel_sampler.populate_task(self.env, self.policy)
        if self.plot:
            plotter.init_plot(self.env, self.policy)

        cur_std = self.init_std
        cur_mean = self.policy.get_param_values()
        # K = cur_mean.size
        n_best = max(1, int(self.n_samples * self.best_frac))

        for itr in range(self.n_itr):
            # sample around the current distribution
            extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
            sample_std = np.sqrt(
                np.square(cur_std) +
                np.square(self.extra_std) * extra_var_mult)
            if self.batch_size is None:
                criterion = 'paths'
                threshold = self.n_samples
            else:
                criterion = 'samples'
                threshold = self.batch_size
            infos = stateful_pool.singleton_pool.run_collect(
                _worker_rollout_policy,
                threshold=threshold,
                args=(dict(cur_mean=cur_mean,
                           sample_std=sample_std,
                           max_path_length=self.max_path_length,
                           discount=self.discount,
                           criterion=criterion,
                           n_evals=self.n_evals), ))
            xs = np.asarray([info[0] for info in infos])
            paths = [info[1] for info in infos]

            fs = np.array([path['returns'][0] for path in paths])
            print((xs.shape, fs.shape))
            best_inds = (-fs).argsort()[:n_best]
            best_xs = xs[best_inds]
            cur_mean = best_xs.mean(axis=0)
            cur_std = best_xs.std(axis=0)
            best_x = best_xs[0]
            logger.push_prefix('itr #%d | ' % itr)
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('CurStdMean', np.mean(cur_std))
            undiscounted_returns = np.array(
                [path['undiscounted_return'] for path in paths])
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('StdReturn', np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular('MinReturn', np.min(undiscounted_returns))
            logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
            logger.record_tabular('NumTrajs', len(paths))
            paths = list(chain(
                *[d['full_paths']
                  for d in paths]))  #flatten paths for the case n_evals > 1
            logger.record_tabular(
                'AvgTrajLen',
                np.mean([len(path['returns']) for path in paths]))

            self.policy.set_param_values(best_x)
            self.env.log_diagnostics(paths)
            self.policy.log_diagnostics(paths)
            logger.save_itr_params(
                itr,
                dict(
                    itr=itr,
                    policy=self.policy,
                    env=self.env,
                    cur_mean=cur_mean,
                    cur_std=cur_std,
                ))
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            if self.plot:
                plotter.update_plot(self.policy, self.max_path_length)
        parallel_sampler.terminate_task()
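The elite-selection update above (sample around the current mean/std, keep the best fraction, refit the distribution) is the core of the cross-entropy method. A minimal self-contained sketch on a toy objective, assuming a simple diagonal-Gaussian sampler:

import numpy as np

def cem(f, mean, std, n_samples=64, best_frac=0.1, n_itr=50):
    # Sample around (mean, std), keep the best fraction, refit mean/std.
    n_best = max(1, int(n_samples * best_frac))
    for _ in range(n_itr):
        xs = np.random.randn(n_samples, mean.size) * std + mean
        fs = np.array([f(x) for x in xs])
        elites = xs[(-fs).argsort()[:n_best]]
        mean, std = elites.mean(axis=0), elites.std(axis=0)
    return mean

# Toy objective: maximize -||x - 3||^2, optimum near x = [3, 3].
print(cem(lambda x: -np.sum((x - 3.0) ** 2), np.zeros(2), np.ones(2)))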
Example #35
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
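The vectorized sampler above keeps one running path per environment and flushes it into the completed-path list whenever that environment reports done. A stripped-down sketch of that bookkeeping (hypothetical transition tuples; env_infos/agent_infos omitted):

import numpy as np

def accumulate_paths(transitions, num_envs):
    # transitions: iterable of (env_idx, obs, action, reward, done) tuples.
    running = [None] * num_envs
    paths = []
    for idx, obs, act, rew, done in transitions:
        if running[idx] is None:
            running[idx] = dict(observations=[], actions=[], rewards=[])
        running[idx]['observations'].append(obs)
        running[idx]['actions'].append(act)
        running[idx]['rewards'].append(rew)
        if done:
            # Flush the finished path for this environment as stacked arrays.
            paths.append({k: np.asarray(v) for k, v in running[idx].items()})
            running[idx] = None
    return paths

transitions = [(0, 0.0, 1, 1.0, False), (1, 0.3, 0, 0.0, True),
               (0, 0.1, 0, 0.5, True)]
print(accumulate_paths(transitions, num_envs=2))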
Example #36
    def train(self):

        cur_std = self.sigma0
        cur_mean = self.policy.get_param_values()
        es = cma_es_lib.CMAEvolutionStrategy(cur_mean, cur_std)

        parallel_sampler.populate_task(self.env, self.policy)
        if self.plot:
            plotter.init_plot(self.env, self.policy)

        cur_std = self.sigma0
        cur_mean = self.policy.get_param_values()

        itr = 0
        while itr < self.n_itr and not es.stop():

            if self.batch_size is None:
                # Sample from multivariate normal distribution.
                xs = es.ask()
                xs = np.asarray(xs)
                # For each sample, do a rollout.
                infos = (stateful_pool.singleton_pool.run_map(
                    sample_return,
                    [(x, self.max_path_length, self.discount) for x in xs]))
            else:
                cum_len = 0
                infos = []
                xss = []
                done = False
                while not done:
                    sbs = stateful_pool.singleton_pool.n_parallel * 2
                    # Sample from multivariate normal distribution.
                    # You want to ask for sbs samples here.
                    xs = es.ask(sbs)
                    xs = np.asarray(xs)

                    xss.append(xs)
                    sinfos = stateful_pool.singleton_pool.run_map(
                        sample_return,
                        [(x, self.max_path_length, self.discount) for x in xs])
                    for info in sinfos:
                        infos.append(info)
                        cum_len += len(info['returns'])
                        if cum_len >= self.batch_size:
                            xs = np.concatenate(xss)
                            done = True
                            break

            # Evaluate fitness of samples (negative as it is minimization
            # problem).
            fs = -np.array([info['returns'][0] for info in infos])
            # When batching, you could have generated too many samples compared
            # to the actual evaluations. So we cut it off in this case.
            xs = xs[:len(fs)]
            # Update CMA-ES params based on sample fitness.
            es.tell(xs, fs)

            logger.push_prefix('itr #%d | ' % itr)
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('CurStdMean', np.mean(cur_std))
            undiscounted_returns = np.array(
                [info['undiscounted_return'] for info in infos])
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('StdReturn', np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular('MinReturn', np.min(undiscounted_returns))
            logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
            logger.record_tabular(
                'AvgTrajLen',
                np.mean([len(info['returns']) for info in infos]))
            self.env.log_diagnostics(infos)
            self.policy.log_diagnostics(infos)

            logger.save_itr_params(
                itr, dict(
                    itr=itr,
                    policy=self.policy,
                    env=self.env,
                ))
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                plotter.update_plot(self.policy, self.max_path_length)
            logger.pop_prefix()
            # Update iteration.
            itr += 1

        # Set final params.
        self.policy.set_param_values(es.result()[0])
        parallel_sampler.terminate_task()
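The ask/tell loop above is the standard CMA-ES interface: request candidate parameter vectors, evaluate their (negated) returns, and feed the fitnesses back. A minimal sketch on a toy objective, assuming the standalone pycma package exposes the same CMAEvolutionStrategy / ask / tell / stop API as the bundled cma_es_lib used above:

import numpy as np
import cma  # assumes the pycma package; the example above uses rllab's bundled cma_es_lib

def f(x):
    # Toy objective to minimize (returns are negated above for the same reason).
    return float(np.sum((np.asarray(x) - 1.0) ** 2))

es = cma.CMAEvolutionStrategy(np.zeros(4), 0.5)
best_x, best_f = None, np.inf
while not es.stop():
    xs = es.ask()            # sample candidates from the current distribution
    fs = [f(x) for x in xs]  # evaluate their fitness
    es.tell(xs, fs)          # update the CMA-ES parameters
    i = int(np.argmin(fs))
    if fs[i] < best_f:
        best_x, best_f = np.asarray(xs[i]), fs[i]
print(best_x, best_f)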
Example #37
    def compute_updated_dists(self, samples):
        """ Compute fast gradients once per iteration and pull them out of tensorflow for sampling with the post-update policy.
        """
        start = time.time()
        num_tasks = len(samples)
        param_keys = self.all_params.keys()
        update_param_keys = param_keys
        no_update_param_keys = []

        sess = tf.get_default_session()

        obs_list, action_list, adv_list = [], [], []
        for i in range(num_tasks):
            inputs = ext.extract(samples[i], 'observations', 'actions',
                                 'advantages')
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])

        inputs = obs_list + action_list + adv_list

        # To do a second update, replace self.all_params below with the params that were used to collect the policy.
        init_param_values = None
        if self.all_param_vals is not None:
            init_param_values = self.get_variable_values(self.all_params)

        step_size = self.step_size
        for i in range(num_tasks):
            if self.all_param_vals is not None:
                self.assign_params(self.all_params, self.all_param_vals[i])

        if 'all_fast_params_tensor' not in dir(self):
            # make computation graph once
            self.all_fast_params_tensor = []
            for i in range(num_tasks):
                gradients = dict(
                    zip(
                        update_param_keys,
                        tf.gradients(self.surr_objs[i], [
                            self.all_params[key] for key in update_param_keys
                        ])))
                fast_params_tensor = OrderedDict(
                    zip(update_param_keys, [
                        self.all_params[key] - step_size * gradients[key]
                        for key in update_param_keys
                    ]))
                for k in no_update_param_keys:
                    fast_params_tensor[k] = self.all_params[k]
                self.all_fast_params_tensor.append(fast_params_tensor)

        # pull the new param values out of tensorflow so the gradient computation is only done once
        # (in the feed dict below, the first zip argument is the placeholder, the second its value)
        # these are the updated values of the params after the gradient step
        self.all_param_vals = sess.run(
            self.all_fast_params_tensor,
            feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

        if init_param_values is not None:
            self.assign_params(self.all_params, init_param_values)

        outputs = []
        self._cur_f_dist_i = {}
        inputs = tf.split(self.input_tensor, num_tasks, 0)
        for i in range(num_tasks):
            # TODO - use a placeholder to feed in the params, so that we don't have to recompile every time.
            task_inp = inputs[i]
            info, _ = self.dist_info_sym(task_inp,
                                         dict(),
                                         all_params=self.all_param_vals[i],
                                         is_training=False)

            outputs.append([info['mean'], info['log_std']])

        self._cur_f_dist = tensor_utils.compile_function(
            inputs=[self.input_tensor],
            outputs=outputs,
        )
        total_time = time.time() - start
        logger.record_tabular("ComputeUpdatedDistTime", total_time)
Example #38
    def optimize_policy(self, itr, all_samples_data):
        assert len(
            all_samples_data
        ) == self.num_grad_updates + 1  # we collected the rollouts to compute the grads and then the test!

        if not self.use_maml:
            all_samples_data = [all_samples_data[0]]

        input_list = []

        # Account for off-policy sampling when more than one beta step is used
        theta0_dist_info_list = []
        for i in range(self.meta_batch_size):
            if 'agent_infos_orig' not in all_samples_data[0][i].keys():
                assert False, "agent_infos_orig is missing--this should have been handled in batch_maml_polopt"
            else:
                agent_infos_orig = all_samples_data[0][i]['agent_infos_orig']
            theta0_dist_info_list += [
                agent_infos_orig[k]
                for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(theta0_dist_info_list)

        theta_l_dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[0][i]['agent_infos']
            theta_l_dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(theta_l_dist_info_list)

        for step in range(
                len(all_samples_data)):  # these are the gradient steps
            obs_list, action_list, adv_list = [], [], []
            for i in range(self.meta_batch_size):

                inputs = ext.extract(all_samples_data[step][i], "observations",
                                     "actions", "advantages")
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                adv_list.append(inputs[2])
            input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]
            if step == 0:  ##CF not used?
                init_inputs = input_list

        if self.use_maml:
            dist_info_list = []
            for i in range(self.meta_batch_size):
                agent_infos = all_samples_data[self.kl_constrain_step][i][
                    'agent_infos']  ##kl_constrain_step default is -1, meaning post all alpha grad updates
                dist_info_list += [
                    agent_infos[k]
                    for k in self.policy.distribution.dist_info_keys
                ]
            input_list += tuple(dist_info_list)
            logger.log("Computing KL before")
            mean_kl_before = self.optimizer.constraint_val(input_list)

        logger.log("Computing loss before")
        loss_before = self.optimizer.loss(input_list)
        if itr not in TESTING_ITRS:
            logger.log("Optimizing")
            self.optimizer.optimize(input_list)
        else:
            logger.log("Not Optimizing")
        logger.log("Computing loss after")
        loss_after = self.optimizer.loss(input_list)
        if self.use_maml:
            logger.log("Computing KL after")
            mean_kl = self.optimizer.constraint_val(input_list)
            logger.record_tabular('MeanKLBefore',
                                  mean_kl_before)  # this now won't be 0!
            logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()
Example #39
File: base.py  Project: rcorona/sac
    def _evaluate(self, epoch):
        """Perform evaluation for the current policy.

        :param epoch: The epoch number.
        :return: None
        """

        if self._eval_n_episodes < 1:
            return

        with self._policy.deterministic(self._eval_deterministic):
            paths = rollouts(
                self._eval_env,
                self._policy,
                self.sampler._max_path_length,
                self._eval_n_episodes,
            )

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        self._eval_env.log_diagnostics(paths)
        if self._eval_render:
            self._eval_env.render(paths)

        iteration = epoch * self._epoch_length
        batch = self.sampler.random_batch()
        self.log_diagnostics(iteration, batch)
Example #40
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    save_dir = 'data/debug/'
    # with open(os.path.join(config.PROJECT_PATH, save_dir, "test.pkl"), 'wb') as handle:
    #     pickle.dump({}, handle)

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload

    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    # initial brownian horizon and size are pretty important
    logger.log("Brownian horizon: {}".format(v['initial_brownian_horizon']))
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=15000,
        variance=v['brownian_variance'],
        animated=False,
    )

    if v['filter_bad_starts']:
        logger.log("Prefilter seed starts: {}".format(len(seed_starts)))
        seed_starts = parallel_check_feasibility(
            env=env,
            starts=seed_starts,
            max_path_length=v['feasibility_path_length'])
        logger.log("Filtered seed starts: {}".format(len(seed_starts)))

    # can also filter these starts optionally

    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb'))
    # logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1])
    init_pos = np.array(init_pos)

    with open(osp.join(log_dir, 'init_pos.json'), 'w') as f:
        json.dump(init_pos.tolist(), f)

    for outer_iter in range(1, v['outer_iters'] + 1):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        starts = generate_starts(env,
                                 starts=seed_starts,
                                 subsample=v['num_new_starts'],
                                 size=2000,
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])

        # note: this messes with the balance between starts and old_starts!
        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env,
                starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(
            all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "qval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.q_vals, handle)
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "preval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.prev_vals, handle)
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # plot starts before training
        # takes too much time
        # labels = label_states(starts, env, policy, v['horizon'],
        #                       as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        # plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
        #                     center=v['goal_center'], maze_id=v['maze_id'],
        #                     summary_string_base='initial starts labels:\n')

        # Following code should be indented
        with ExperimentLogger(log_dir,
                              outer_iter // 50,
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")

        [starts, labels] = label_states_from_paths(trpo_paths,
                                                   n_traj=v['n_traj'],
                                                   key='goal_reached',
                                                   as_goal=False,
                                                   env=env)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add noise if all of the sampled states ended up being high reward
                logger.log("We have {} good starts!".format(
                    len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                logger.log(
                    "More bad starts than good starts, sampling seeds from replay buffer"
                )
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env,
                                              starts=starts,
                                              horizon=v['horizon'] * 2,
                                              subsample=v['num_new_starts'],
                                              size=10000,
                                              variance=v['brownian_variance'] *
                                              10)

        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
            filtered_raw_starts = starts  # no filtering done
        else:
            raise Exception("Unknown 'seed_with' option: {}".format(v['seed_with']))

        # keep this block last: it reassigns the labels variable used above
        logger.log("Labeling on uniform starts")
        if not v["debug"]:
            # with logger.tabular_prefix("Uniform_"):
            #     unif_starts = all_feasible_starts.sample(100)
            #     mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
            #                                          as_goals=False, full_path=True)
            #     env.log_diagnostics(paths)
            #     mean_rewards = mean_reward.reshape(-1, 1)
            #     labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
            #                             improvement_threshold=improvement_threshold)
            #     logger.log("Starts labelled")
            #     plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
            #                         center=v['goal_center'], maze_id=v['maze_id'],
            #                         summary_string_base='initial starts labels:\n')
            #     report.add_text("Uniform Success: " + str(np.mean(mean_reward)))

            with logger.tabular_prefix("Fixed_"):
                mean_reward, paths = evaluate_states(init_pos,
                                                     env,
                                                     policy,
                                                     v['horizon'],
                                                     n_traj=5,
                                                     key='goal_reached',
                                                     as_goals=False,
                                                     full_path=True)

                with open(osp.join(log_dir, 'init_pos_per_state_mean_return.csv'), 'a') as f:
                    writer = csv.writer(f)
                    writer.writerow([outer_iter] + list(mean_reward))

                env.log_diagnostics(paths)
                mean_rewards = mean_reward.reshape(-1, 1)
                labels = compute_labels(
                    mean_rewards,
                    old_rewards=old_rewards,
                    min_reward=min_reward,
                    max_reward=max_reward,
                    improvement_threshold=improvement_threshold)
                logger.log("Starts labelled")
                plot_labeled_states(
                    init_pos,
                    labels,
                    report=report,
                    itr=outer_iter,
                    limit=v['goal_range'],
                    center=v['goal_center'],
                    maze_id=v['maze_id'],
                    summary_string_base='initial starts labels:\n')
                report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

            report.new_row()
            report.save()
            logger.record_tabular("Fixed test set_success: ",
                                  np.mean(mean_reward))
            logger.dump_tabular()

        if (outer_iter == 1 or outer_iter % 5 == 0) and v.get('scratch_dir', False):
            command = 'rsync -a --delete {} {}'.format(
                os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], ''))
            print("Running command:\n{}".format(command))
            subprocess.run(command.split(), check=True)

    if v.get('scratch_dir', False):
        command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''),
                                          os.path.join(v['scratch_dir'], ''))
        print("Running command:\n{}".format(command))
        subprocess.run(command.split(), check=True)
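
Both rsync blocks above mirror the contents of log_dir into the scratch directory; os.path.join(dir, '') appends a trailing separator so rsync copies the directory's contents rather than nesting a new folder inside the target. A minimal sketch of the same pattern as a standalone helper (the name sync_to_scratch is illustrative, not from the original code):

import os
import subprocess


def sync_to_scratch(log_dir, scratch_dir, delete=True):
    """Mirror log_dir into scratch_dir with rsync.

    The trailing '' in os.path.join yields 'log_dir/', so rsync syncs the
    directory contents instead of creating a nested copy inside scratch_dir.
    """
    src = os.path.join(log_dir, '')
    dst = os.path.join(scratch_dir, '')
    command = ['rsync', '-a'] + (['--delete'] if delete else []) + [src, dst]
    print("Running command:\n{}".format(' '.join(command)))
    subprocess.run(command, check=True)  # raises CalledProcessError if rsync fails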
Example #41
0
File: base.py Project: rcorona/sac
    def _train(self, env, policy, initial_exploration_policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            initial_exploration_policy (`Policy`): Policy used for initial exploration.
                If None, then all exploration is done using `policy`.
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy, pool)
            initial_exploration_done = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    if not initial_exploration_done:
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
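
The epoch loop above uses gtimer (imported as gt) to time the sample/train/eval phases and then feeds the per-iteration stamps into logger.record_tabular. A minimal sketch of just the stamping pattern, with do_sample/do_train as hypothetical placeholders for the sampler and training calls:

import time

import gtimer as gt


def do_sample():  # hypothetical stand-in for sampler.sample()
    time.sleep(0.01)


def do_train():  # hypothetical stand-in for _do_training(...)
    time.sleep(0.02)


gt.set_def_unique(False)  # allow reusing the same stamp name every epoch
for epoch in gt.timed_for(range(3), save_itrs=True):
    do_sample()
    gt.stamp('sample')  # time since the previous stamp is recorded under 'sample'
    do_train()
    gt.stamp('train')

times_itrs = gt.get_times().stamps.itrs  # dict: stamp name -> list of per-epoch durations
print('last train time:', times_itrs['train'][-1])
print('total time:', gt.get_times().total)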
Example #42
0
 def log_diagnostics(self, paths):
     log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
     logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))
Example #43
0
 def log_diagnostics(self):
     logger.record_tabular('pool-size', self.pool.size)
Example #44
0
File: maml_il.py Project: NH2017/GMPS
    def optimize_policy(self, itr, all_samples_data):
        # we collected the rollouts to compute the grads and then the test!
        assert len(all_samples_data) >= self.num_grad_updates + 1
        assert self.use_maml

        input_vals_list = []

        # Code to account for off-policy sampling when more than 1 beta steps
        theta0_dist_info_list = []
        for i in range(self.meta_batch_size):
            if 'agent_infos_orig' not in all_samples_data[0][i].keys():
                assert False, "agent_infos_orig is missing--this should have been handled in batch_maml_polopt"
            else:
                agent_infos_orig = all_samples_data[0][i]['agent_infos_orig']
            theta0_dist_info_list += [
                agent_infos_orig[k]
                for k in self.policy.distribution.dist_info_keys
            ]
        input_vals_list += tuple(theta0_dist_info_list)

        for step in range(self.num_grad_updates):
            obs_list, action_list, adv_list, rewards_list, returns_list, path_lengths_list, expert_action_list = [], [], [], [], [], [], []
            for i in range(self.meta_batch_size):  # for each task

                inputs = ext.extract(
                    all_samples_data[step][i],
                    "observations",
                    "actions",
                    "advantages",
                    "expert_actions",
                )
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                adv_list.append(inputs[2])
                expert_action_list.append(inputs[3])

            input_vals_list += obs_list + action_list + adv_list + expert_action_list

        for step in [self.num_grad_updates]:  # last step
            obs_list, action_list, expert_action_list = [], [], []  # last step's adv_list not currently used in maml_il
            for i in range(self.meta_batch_size):  # for each task
                inputs = ext.extract(
                    all_samples_data[step][i],
                    "observations",
                    "actions",
                    "expert_actions",
                )
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                expert_action_list.append(inputs[2])

            input_vals_list += obs_list + action_list + expert_action_list

        logger.log("Computing loss before")
        # loss_before = self.optimizer.loss(input_vals_list)
        if itr not in TESTING_ITRS:
            steps = self.adam_curve[min(itr, len(self.adam_curve) - 1)]
            logger.log("Optimizing using %s Adam steps on itr %s" %
                       (steps, itr))
            start_loss = self.optimizer.optimize(input_vals_list, steps=steps)
            # self.optimizer.optimize(input_vals_list)
            return start_loss

        else:
            logger.log("Not Optimizing")
            logger.record_tabular("ILLoss", float('nan'))
            return None
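
optimize_policy above flattens per-task arrays into input_vals_list with ext.extract, which returns the values of the requested keys as a tuple in the same order they were asked for, so the downstream unpacking is purely positional. A tiny sketch of that behaviour, assuming rllab's rllab.misc.ext module (the toy arrays are illustrative):

import numpy as np
from rllab.misc import ext

samples_data = {
    "observations": np.zeros((5, 3)),
    "actions": np.zeros((5, 2)),
    "expert_actions": np.zeros((5, 2)),
}

# the returned tuple follows the key order, so unpacking is position-based
obs, acts, expert_acts = ext.extract(samples_data, "observations", "actions", "expert_actions")
print(obs.shape, acts.shape, expert_acts.shape)  # (5, 3) (5, 2) (5, 2)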
Example #45
0
    def train(self):
        # TODO - make this a util
        flatten_list = lambda l: [item for sublist in l for item in sublist]

        with tf.Session() as sess:
            # Code for loading a previous policy. Somewhat hacky because needs to be in sess.
            if self.load_policy is not None:
                import joblib
                self.policy = joblib.load(self.load_policy)['policy']
            self.init_opt()
            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = []
            for var in tf.global_variables():
                # note: this is hacky; there may be a better way to do this in newer TF.
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninit_vars.append(var)
            sess.run(tf.variables_initializer(uninit_vars))

            self.start_worker()
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):
                    logger.log(
                        "Sampling set of tasks/goals for this meta-batch...")

                    env = self.env
                    # print(env)
                    while 'sample_goals' not in dir(env):
                        # print(dir(env))
                        env = env.wrapped_env
                    learner_env_goals = np.array([
                        env.sample_goals() for i in range(self.meta_batch_size)
                    ])  # (self.meta_batch_size)
                    # learner_env_goals = env.sample_goals(self.meta_batch_size)
                    # print(learner_env_goals.shape)
                    # print(learner_env_goals.type)
                    # learner_env_goals = None
                    self.policy.switch_to_init_dist()  # switch to the pre-update policy

                    all_samples_data, all_paths = [], []
                    for step in range(self.num_grad_updates + 1):
                        #if step > 0:
                        #    import pdb; pdb.set_trace() # test param_vals functions.
                        logger.log('** Step ' + str(step) + ' **')
                        logger.log("Obtaining samples...")
                        paths = self.obtain_samples(
                            itr,
                            reset_args=learner_env_goals,
                            log_prefix=str(step))
                        all_paths.append(paths)
                        logger.log("Processing samples...")
                        samples_data = {}
                        for key in paths.keys():  # the keys are the tasks
                            # don't log because this will spam the console with every task.
                            samples_data[key] = self.process_samples(
                                itr, paths[key], log=False)
                        all_samples_data.append(samples_data)
                        # for logging purposes only
                        self.process_samples(itr,
                                             flatten_list(paths.values()),
                                             prefix=str(step),
                                             log=True)
                        logger.log("Logging diagnostics...")
                        self.log_diagnostics(flatten_list(paths.values()),
                                             prefix=str(step))
                        if step < self.num_grad_updates:
                            logger.log("Computing policy updates...")
                            self.policy.compute_updated_dists(samples_data)

                    logger.log("Optimizing policy...")
                    # This needs to take all samples_data so that it can construct graph for meta-optimization.
                    self.optimize_policy(itr, all_samples_data)
                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(
                        itr, all_samples_data[-1])  # , **kwargs)
                    if self.store_paths:
                        params["paths"] = all_samples_data[-1]["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)

                    logger.dump_tabular(with_prefix=False)

                    # The rest is some example plotting code.
                    # Plotting code is useful for visualizing trajectories across a few different tasks.
                    if False and itr % 2 == 0 and self.env.observation_space.shape[
                            0] <= 4:  # point-mass
                        logger.log("Saving visualization of paths")
                        for ind in range(min(5, self.meta_batch_size)):
                            plt.clf()
                            plt.plot(learner_env_goals[ind][0],
                                     learner_env_goals[ind][1],
                                     'k*',
                                     markersize=10)
                            plt.hold(True)

                            preupdate_paths = all_paths[0]
                            postupdate_paths = all_paths[-1]

                            pre_points = preupdate_paths[ind][0][
                                'observations']
                            post_points = postupdate_paths[ind][0][
                                'observations']
                            plt.plot(pre_points[:, 0],
                                     pre_points[:, 1],
                                     '-r',
                                     linewidth=2)
                            plt.plot(post_points[:, 0],
                                     post_points[:, 1],
                                     '-b',
                                     linewidth=1)

                            pre_points = preupdate_paths[ind][1][
                                'observations']
                            post_points = postupdate_paths[ind][1][
                                'observations']
                            plt.plot(pre_points[:, 0],
                                     pre_points[:, 1],
                                     '--r',
                                     linewidth=2)
                            plt.plot(post_points[:, 0],
                                     post_points[:, 1],
                                     '--b',
                                     linewidth=1)

                            pre_points = preupdate_paths[ind][2][
                                'observations']
                            post_points = postupdate_paths[ind][2][
                                'observations']
                            plt.plot(pre_points[:, 0],
                                     pre_points[:, 1],
                                     '-.r',
                                     linewidth=2)
                            plt.plot(post_points[:, 0],
                                     post_points[:, 1],
                                     '-.b',
                                     linewidth=1)

                            plt.plot(0, 0, 'k.', markersize=5)
                            plt.xlim([-0.8, 0.8])
                            plt.ylim([-0.8, 0.8])
                            plt.legend(
                                ['goal', 'preupdate path', 'postupdate path'])
                            plt.savefig(
                                osp.join(logger.get_snapshot_dir(),
                                         'prepost_path' + str(ind) + '.png'))
                    elif False and itr % 2 == 0:  # swimmer or cheetah
                        logger.log("Saving visualization of paths")
                        for ind in range(min(5, self.meta_batch_size)):
                            plt.clf()
                            goal_vel = learner_env_goals[ind]
                            plt.title('Swimmer paths, goal vel=' +
                                      str(goal_vel))
                            plt.hold(True)

                            prepathobs = all_paths[0][ind][0]['observations']
                            postpathobs = all_paths[-1][ind][0]['observations']
                            plt.plot(prepathobs[:, 0],
                                     prepathobs[:, 1],
                                     '-r',
                                     linewidth=2)
                            plt.plot(postpathobs[:, 0],
                                     postpathobs[:, 1],
                                     '--b',
                                     linewidth=1)
                            plt.plot(prepathobs[-1, 0],
                                     prepathobs[-1, 1],
                                     'r*',
                                     markersize=10)
                            plt.plot(postpathobs[-1, 0],
                                     postpathobs[-1, 1],
                                     'b*',
                                     markersize=10)
                            plt.xlim([-1.0, 5.0])
                            plt.ylim([-1.0, 1.0])

                            plt.legend(['preupdate path', 'postupdate path'],
                                       loc=2)
                            plt.savefig(
                                osp.join(
                                    logger.get_snapshot_dir(),
                                    'swim1d_prepost_itr' + str(itr) + '_id' +
                                    str(ind) + '.pdf'))
        self.shutdown_worker()
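
The train() above (and a later train() in this listing) restores a saved policy with joblib and then initializes only the TensorFlow variables that were not loaded, by probing each variable and catching FailedPreconditionError. A minimal sketch of that selective initialization as a helper, assuming TF1-style graph mode as in the examples (the name init_uninitialized_vars is illustrative):

import tensorflow as tf


def init_uninitialized_vars(sess):
    """Initialize only variables that have no value in this session yet.

    sess.run(var) raises FailedPreconditionError for uninitialized variables;
    anything already loaded (e.g. via joblib) is left untouched.
    """
    uninit_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninit_vars.append(var)
    sess.run(tf.variables_initializer(uninit_vars))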
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean([
            special.discount_return(path["rewards"], self.discount)
            for path in paths
        ])

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(
            np.square(np.concatenate([path["actions"] for path in paths])))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True))
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True))

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []
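
evaluate() above records the same average/std/max/min summary for several quantities (returns, exploration returns, and so on). A small helper showing that pattern with rllab's logger; the function record_stat is illustrative, not part of rllab:

import numpy as np
from rllab.misc import logger


def record_stat(key, values):
    """Record Average/Std/Max/Min of `values` under keys like 'AverageReturn'."""
    values = np.asarray(values)
    logger.record_tabular('Average' + key, np.mean(values))
    logger.record_tabular('Std' + key, np.std(values))
    logger.record_tabular('Max' + key, np.max(values))
    logger.record_tabular('Min' + key, np.min(values))


# usage sketch:
# record_stat('Return', [sum(path["rewards"]) for path in paths])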
Example #47
0
def sample_trajectories(env,
                        policy_in,
                        policy_out,
                        exploration,
                        batch_size,
                        saver,
                        diff_weights,
                        log_dir,
                        logger,
                        is_monitored,
                        monitorpath,
                        sess,
                        max_timestep,
                        render_every=None,
                        cost_np=None,
                        is_done=None):

    saver.save(sess,
               os.path.join(log_dir, 'policy.ckpt'),
               write_meta_graph=False)

    if is_monitored:
        from gym import wrappers
        env = wrappers.Monitor(env, monitorpath)
    Os = []
    As = []
    Rs = []
    max_eps_reward = -np.inf
    min_eps_reward = np.inf
    avg_eps_reward = 0.0
    _counter = 1
    while _counter <= batch_size:
        o = []
        a = []
        r = []
        if is_monitored:
            env.stats_recorder.done = True
        observation = env.reset()
        o.append(observation)
        episode_reward = 0.0
        avg_weight_change = prepare_policy(sess, exploration['param_noise'],
                                           diff_weights,
                                           exploration['initial_param_std'])
        for t in range(max_timestep):
            # Perturb policy.
            if exploration['vary_trajectory_noise']:
                action_noise = exploration['action_noise'] * np.random.uniform()
            else:
                action_noise = exploration['action_noise']
            action = get_action(observation,
                                policy_in,
                                policy_out,
                                sess,
                                action_noise=action_noise,
                                action_bounds=env.action_space.bounds)
            observation, reward, done, info = env.step(action)
            # Debug is_done
            if is_done is not None:
                assert done == is_done(o[-1][None], observation[None])[0]
            o.append(observation)
            a.append(action[0])
            r.append(reward)
            episode_reward += reward
            _counter += 1
            if render_every is not None and len(Os) % render_every == 0:
                env.render()
            if done:
                break
        # debugging cost function
        if cost_np is not None:
            episode_cost = len(a) * cost_np(np.array(o[:-1]), np.array(a),
                                            np.array(o[1:]))
            # Check if cost_np + env_reward == 0
            logger.info(
                '%d steps, cost %.2f, verify_cost %.3f, avg_weight_change %.3f'
                % (_counter - 1, episode_cost, episode_reward + episode_cost,
                   avg_weight_change))
        else:
            logger.info('%d steps, reward %.2f, avg_weight_change %.3f' %
                        (_counter - 1, episode_reward, avg_weight_change))
        # Recover policy
        saver.restore(sess, os.path.join(log_dir, 'policy.ckpt'))
        logger.debug("Restored the policy back to %s" %
                     os.path.join(log_dir, 'policy.ckpt'))

        Os.append(o)
        As.append(a)
        Rs.append(r)
        # Update stats
        avg_eps_reward += episode_reward
        if episode_reward > max_eps_reward:
            max_eps_reward = episode_reward
        if episode_reward < min_eps_reward:
            min_eps_reward = episode_reward

    avg_eps_reward /= len(Os)
    rllab_logger.record_tabular('EpisodesCollected', len(Os))
    rllab_logger.record_tabular('TimeStepsCollected', _counter - 1)
    return Os, As, Rs, {
        'avg_eps_reward': avg_eps_reward,
        'min_eps_reward': min_eps_reward,
        'max_eps_reward': max_eps_reward
    }
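
sample_trajectories above checkpoints the policy with a tf.train.Saver before adding exploration noise to its weights and restores the checkpoint after every episode, so the parameter-space perturbations never accumulate. A minimal sketch of that save/perturb/restore pattern; perturb_weights and run_episode are hypothetical callables standing in for prepare_policy(...) and the rollout loop:

import os

import tensorflow as tf


def collect_with_perturbations(sess, saver, log_dir, n_episodes,
                               perturb_weights, run_episode):
    """Sketch: snapshot the policy, perturb it per episode, then restore."""
    ckpt = os.path.join(log_dir, 'policy.ckpt')
    saver.save(sess, ckpt, write_meta_graph=False)  # snapshot the unperturbed weights
    for _ in range(n_episodes):
        perturb_weights(sess)      # e.g. add parameter-space exploration noise
        run_episode(sess)
        saver.restore(sess, ckpt)  # undo the perturbation before the next episode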
Example #48
0
File: base.py Project: flyers/rllab
    def process_samples(self, itr, paths):
        baselines = []
        returns = []
        for path in paths:
            path_baselines = np.append(self.algo.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.algo.center_adv:
                raw_adv = np.concatenate([path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
            )

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
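
process_samples above computes generalized advantage estimates: for each path the TD residuals delta_t = r_t + discount * V(s_{t+1}) - V(s_t) are discounted by discount * gae_lambda and summed from the end of the path. A minimal numpy sketch of the same computation for a single path, with discount_cumsum written out instead of rllab's special.discount_cumsum:

import numpy as np


def discount_cumsum(x, discount):
    """y[t] = sum_k discount**k * x[t + k] (reverse discounted cumulative sum)."""
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y


def gae_advantages(rewards, baselines, discount=0.99, gae_lambda=0.97):
    """baselines has one value per timestep; a terminal value of 0 is appended."""
    rewards = np.asarray(rewards, dtype=float)
    vs = np.append(np.asarray(baselines, dtype=float), 0.0)
    deltas = rewards + discount * vs[1:] - vs[:-1]  # TD residuals
    return discount_cumsum(deltas, discount * gae_lambda)


# usage sketch: path["advantages"] = gae_advantages(path["rewards"], baseline.predict(path))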
    def optimize(self, inputs, extra_inputs=None, subsample_grouped_inputs=None):

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()

        if self._subsample_factor < 1:
            if subsample_grouped_inputs is None:
                subsample_grouped_inputs = [inputs]
            subsample_inputs = tuple()
            for inputs_grouped in subsample_grouped_inputs:
                n_samples = len(inputs_grouped[0])
                inds = np.random.choice(
                    n_samples, int(n_samples * self._subsample_factor), replace=False)
                subsample_inputs += tuple([x[inds] for x in inputs_grouped])
        else:
            subsample_inputs = inputs

        logger.log("computing loss before")
        loss_before = sliced_fun(self._opt_fun["f_loss"], self._num_slices)(
            inputs, extra_inputs)
        logger.log("performing update")
        logger.log("computing descent direction")

        flat_g = sliced_fun(self._opt_fun["f_grad"], self._num_slices)(
            inputs, extra_inputs)

        Hx = self._hvp_approach.build_eval(subsample_inputs + extra_inputs)

        descent_direction = krylov.cg(Hx, flat_g, cg_iters=self._cg_iters)

        approx_g = Hx(descent_direction)
        q = descent_direction.dot(approx_g)
        residual = np.sqrt((approx_g - flat_g).dot(approx_g - flat_g))
        rescale = q / (descent_direction.dot(descent_direction))
        logger.record_tabular("OptimDiagnostic_Residual", residual)
        logger.record_tabular("OptimDiagnostic_Rescale", rescale)

        initial_step_size = np.sqrt(
            2.0 * self._max_constraint_val *
            (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8))
        )
        if np.isnan(initial_step_size):
            initial_step_size = 1.
        flat_descent_step = initial_step_size * descent_direction

        logger.log("descent direction computed")

        prev_param = np.copy(self._target.get_param_values(trainable=True))
        n_iter = 0
        for n_iter, ratio in enumerate(self._backtrack_ratio ** np.arange(self._max_backtracks)):
            cur_step = ratio * flat_descent_step
            cur_param = prev_param - cur_step
            self._target.set_param_values(cur_param, trainable=True)
            loss, constraint_val = sliced_fun(
                self._opt_fun["f_loss_constraint"], self._num_slices)(inputs, extra_inputs)
            if loss < loss_before and constraint_val <= self._max_constraint_val:
                break
        if (np.isnan(loss) or np.isnan(constraint_val) or loss >= loss_before or constraint_val >=
            self._max_constraint_val) and not self._accept_violation:
            logger.log("Line search condition violated. Rejecting the step!")
            if np.isnan(loss):
                logger.log("Violated because loss is NaN")
            if np.isnan(constraint_val):
                logger.log("Violated because constraint %s is NaN" %
                           self._constraint_name)
            if loss >= loss_before:
                logger.log("Violated because loss not improving")
            if constraint_val >= self._max_constraint_val:
                logger.log(
                    "Violated because constraint %s is violated" % self._constraint_name)
            self._target.set_param_values(prev_param, trainable=True)
        logger.log("backtrack iters: %d" % n_iter)
        logger.log("computing loss after")
        logger.log("optimization finished")
    def train(self):
        # TODO - make this a util
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # with tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})) as sess:
        with tf.Session(config=config) as sess:
            tf.set_random_seed(1)
            # Code for loading a previous policy. Somewhat hacky because needs to be in sess.
            if self.load_policy is not None:
                self.policy = joblib.load(self.load_policy)['policy']

            self.init_opt()
            self.init_experts_opt()
            # initialize uninitialized vars  (only initialize vars that were not loaded)
            uninit_vars = []
            # sess.run(tf.global_variables_initializer())
            for var in tf.global_variables():
                # note: this is hacky; there may be a better way to do this in newer TF.
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninit_vars.append(var)
            sess.run(tf.variables_initializer(uninit_vars))
            self.start_worker()
            start_time = time.time()
            self.metaitr = 0

            self.expertLearning_itrs = [30 * i for i in range(100)]

            expertPaths = []
            for itr in range(self.start_itr, self.n_itr):

                if itr in self.expertLearning_itrs:
                    expertPathsDict = self.trainExperts(self.expert_num_itrs)

                # trainIndices = np.random.choice(np.arange(0, len(self.trainGoals)), self.meta_batch_size, replace = False)
                # curr_trainGoals = self.trainGoals[trainIndices]
                # curr_expertPaths = {i : expertPathsDict[key] for i, key in enumerate(trainIndices)}
                curr_trainGoals = self.trainGoals
                curr_expertPaths = expertPathsDict

                itr_start_time = time.time()
                np.random.seed(self.seed + itr)
                tf.set_random_seed(self.seed + itr)
                rd.seed(self.seed + itr)
                with logger.prefix('itr #%d | ' % itr):
                    all_paths_for_plotting = []
                    all_postupdate_paths = []
                    self.beta_steps = min(
                        self.beta_steps,
                        self.beta_curve[min(itr,
                                            len(self.beta_curve) - 1)])
                    if itr in self.testing_itrs:
                        beta_steps_range = range(self.test_goals_mult)
                    else:
                        beta_steps_range = range(self.beta_steps)
                    beta0_step0_paths = None
                    num_inner_updates = self.num_grad_updates_for_testing if itr in self.testing_itrs else self.num_grad_updates

                    for beta_step in beta_steps_range:
                        all_samples_data_for_betastep = []
                        print("debug, pre-update std modifier")
                        self.policy.std_modifier = self.pre_std_modifier

                        self.policy.switch_to_init_dist()
                        self.policy.perTask_switch_to_init_dist()  # switch to the pre-update policy

                        if itr in self.testing_itrs:

                            # env = self.env
                            # while 'sample_goals' not in dir(env):

                            #     env = env.wrapped_env
                            #if self.test_on_training_goals:

                            goals_to_use = curr_trainGoals
                            # else:
                            #     goals_to_use = env.sample_goals(self.meta_batch_size)

                        for step in range(num_inner_updates + 1):  # inner loop
                            logger.log('** Betastep %s ** Step %s **' %
                                       (str(beta_step), str(step)))
                            logger.log("Obtaining samples...")

                            if itr in self.testing_itrs:
                                if step < num_inner_updates:
                                    print(
                                        'debug12.0.0, test-time sampling step=',
                                        step)  #, goals_to_use)
                                    paths = self.obtain_samples(
                                        itr=itr,
                                        reset_args=goals_to_use,
                                        log_prefix=str(beta_step) + "_" +
                                        str(step),
                                        testitr=True,
                                        preupdate=True,
                                        mode='vec')

                                    paths = store_agent_infos(paths)  # agent_infos_orig is populated here

                                elif step == num_inner_updates:
                                    print(
                                        'debug12.0.1, test-time sampling step=',
                                        step)  #, goals_to_use)

                                    paths = self.obtain_samples(
                                        itr=itr,
                                        reset_args=goals_to_use,
                                        log_prefix=str(beta_step) + "_" +
                                        str(step),
                                        testitr=True,
                                        preupdate=False,
                                        mode=self.updateMode)

                                    all_postupdate_paths.extend(paths.values())

                            elif self.expert_trajs_dir is None or (
                                    beta_step == 0
                                    and step < num_inner_updates):
                                print("debug12.1, regular sampling"
                                      )  #, self.goals_to_use_dict[itr])

                                paths = self.obtain_samples(
                                    itr=itr,
                                    reset_args=curr_trainGoals,
                                    log_prefix=str(beta_step) + "_" +
                                    str(step),
                                    preupdate=True,
                                    mode='vec')

                                if beta_step == 0 and step == 0:
                                    paths = store_agent_infos(paths)  # agent_infos_orig is populated here
                                    beta0_step0_paths = deepcopy(paths)
                            elif step == num_inner_updates:
                                print("debug12.2, expert traj")
                                paths = curr_expertPaths

                            else:
                                assert False, "we shouldn't be able to get here"

                            all_paths_for_plotting.append(paths)
                            logger.log("Processing samples...")
                            samples_data = {}

                            for tasknum in paths.keys():  # the keys are the tasks
                                # don't log because this will spam the console with every task.

                                if self.use_maml_il and step == num_inner_updates:
                                    fast_process = True
                                else:
                                    fast_process = False
                                if itr in self.testing_itrs:
                                    testitr = True
                                else:
                                    testitr = False
                                samples_data[tasknum] = self.process_samples(
                                    itr,
                                    paths[tasknum],
                                    log=False,
                                    fast_process=fast_process,
                                    testitr=testitr,
                                    metalearn_baseline=self.metalearn_baseline)

                            all_samples_data_for_betastep.append(samples_data)

                            # for logging purposes
                            self.process_samples(
                                itr,
                                flatten_list(paths.values()),
                                prefix=str(step),
                                log=True,
                                fast_process=True,
                                testitr=testitr,
                                metalearn_baseline=self.metalearn_baseline)
                            if itr in self.testing_itrs:
                                self.log_diagnostics(flatten_list(
                                    paths.values()),
                                                     prefix=str(step))

                            if step == num_inner_updates:
                                # logger.record_tabular("AverageReturnLastTest", self.parallel_sampler.memory["AverageReturnLastTest"], front=True)  # TODO: add functionality for multiple grad steps
                                logger.record_tabular(
                                    "TestItr", ("1" if testitr else "0"),
                                    front=True)
                                logger.record_tabular("MetaItr",
                                                      self.metaitr,
                                                      front=True)

                            if step == num_inner_updates - 1:
                                if itr not in self.testing_itrs:
                                    print(
                                        "debug, post update train std modifier"
                                    )
                                    self.policy.std_modifier = self.post_std_modifier_train * self.policy.std_modifier
                                else:
                                    print(
                                        "debug, post update test std modifier")
                                    self.policy.std_modifier = self.post_std_modifier_test * self.policy.std_modifier
                                if (itr in self.testing_itrs
                                        or not self.use_maml_il
                                        or step < num_inner_updates - 1
                                    ) and step < num_inner_updates:
                                    # do not update on last grad step, and do not update on second to last step when training MAMLIL
                                    logger.log("Computing policy updates...")
                                    self.policy.compute_updated_dists(
                                        samples=samples_data)

                        logger.log("Optimizing policy...")
                        # This needs to take all samples_data so that it can construct graph for meta-optimization.
                        start_loss = self.optimize_policy(
                            itr, all_samples_data_for_betastep)

                    if itr not in self.testing_itrs:
                        self.metaitr += 1
                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(
                        itr, all_samples_data_for_betastep[-1])  # , **kwargs)
                    print("debug123, params", params)
                    if self.store_paths:
                        params["paths"] = all_samples_data_for_betastep[-1][
                            "paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)

                    #self.plotTrajs(itr, all_paths_for_plotting)
        self.shutdown_worker()
Example #51
0
    def log_diagnostics(self, paths):
        arm_dists = [p['env_infos'][-1]['arm_distance'] for p in paths]
        goal_dists = [p['env_infos'][-1]['goal_distance'] for p in paths]

        logger.record_tabular('FinalArmDistanceAvg', np.mean(arm_dists))
        logger.record_tabular('FinalArmDistanceMax', np.max(arm_dists))
        logger.record_tabular('FinalArmDistanceMin', np.min(arm_dists))
        logger.record_tabular('FinalArmDistanceStd', np.std(arm_dists))

        logger.record_tabular('FinalGoalDistanceAvg', np.mean(goal_dists))
        logger.record_tabular('FinalGoalDistanceMax', np.max(goal_dists))
        logger.record_tabular('FinalGoalDistanceMin', np.min(goal_dists))
        logger.record_tabular('FinalGoalDistanceStd', np.std(goal_dists))
Example #52
0
    def train(self):

        cur_std = self.sigma0
        cur_mean = self.policy.get_param_values()
        es = cma_es_lib.CMAEvolutionStrategy(
            cur_mean, cur_std)

        parallel_sampler.populate_task(self.env, self.policy)
        if self.plot:
            plotter.init_plot(self.env, self.policy)

        cur_std = self.sigma0
        cur_mean = self.policy.get_param_values()

        itr = 0
        while itr < self.n_itr and not es.stop():

            if self.batch_size is None:
                # Sample from multivariate normal distribution.
                xs = es.ask()
                xs = np.asarray(xs)
                # For each sample, do a rollout.
                infos = (
                    stateful_pool.singleton_pool.run_map(sample_return, [(x, self.max_path_length,
                                                                          self.discount) for x in xs]))
            else:
                cum_len = 0
                infos = []
                xss = []
                done = False
                while not done:
                    sbs = stateful_pool.singleton_pool.n_parallel * 2
                    # Sample from multivariate normal distribution.
                    # You want to ask for sbs samples here.
                    xs = es.ask(sbs)
                    xs = np.asarray(xs)

                    xss.append(xs)
                    sinfos = stateful_pool.singleton_pool.run_map(
                        sample_return, [(x, self.max_path_length, self.discount) for x in xs])
                    for info in sinfos:
                        infos.append(info)
                        cum_len += len(info['returns'])
                        if cum_len >= self.batch_size:
                            xs = np.concatenate(xss)
                            done = True
                            break

            # Evaluate fitness of samples (negative as it is minimization
            # problem).
            fs = - np.array([info['returns'][0] for info in infos])
            # When batching, you could have generated too many samples compared
            # to the actual evaluations. So we cut it off in this case.
            xs = xs[:len(fs)]
            # Update CMA-ES params based on sample fitness.
            es.tell(xs, fs)

            logger.push_prefix('itr #%d | ' % itr)
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('CurStdMean', np.mean(cur_std))
            undiscounted_returns = np.array(
                [info['undiscounted_return'] for info in infos])
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('StdReturn',
                                  np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn',
                                  np.max(undiscounted_returns))
            logger.record_tabular('MinReturn',
                                  np.min(undiscounted_returns))
            logger.record_tabular('AverageDiscountedReturn',
                                  np.mean(fs))
            logger.record_tabular('AvgTrajLen',
                                  np.mean([len(info['returns']) for info in infos]))
            self.env.log_diagnostics(infos)
            self.policy.log_diagnostics(infos)

            logger.save_itr_params(itr, dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
            ))
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                plotter.update_plot(self.policy, self.max_path_length)
            logger.pop_prefix()
            # Update iteration.
            itr += 1

        # Set final params.
        self.policy.set_param_values(es.result()[0])
        parallel_sampler.terminate_task()
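
The CMA-ES trainer above drives the optimizer purely through its ask/tell interface: ask() proposes candidate parameter vectors, their (negated) returns are evaluated, and tell() updates the search distribution. A toy sketch of that loop with the pycma package (imported here as cma, which is presumably what cma_es_lib refers to; the sphere objective is illustrative):

import numpy as np
import cma


def negated_return(x):
    """Toy stand-in for a rollout: CMA-ES minimizes, so returns are negated."""
    return float(np.sum(np.asarray(x) ** 2))


es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5)  # (initial mean, initial std)
best_f = np.inf
for _ in range(50):  # cap the number of generations for this sketch
    if es.stop():
        break
    xs = es.ask()                        # sample candidate parameter vectors
    fs = [negated_return(x) for x in xs]
    es.tell(xs, fs)                      # update mean/covariance from the fitness values
    best_f = min(best_f, min(fs))
print('best (negated) objective found:', best_f)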
Example #53
0
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
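
The advantage computation in this sampler is generalized advantage estimation (GAE): the one-step TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) are discounted by gamma * lambda and summed along each path. A minimal NumPy sketch of the same calculation, using a standalone discount_cumsum stand-in rather than rllab's special.discount_cumsum (all names below are illustrative):

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages(rewards, baselines, discount=0.99, gae_lambda=0.97):
    # Append V = 0 for the terminal state so v[1:] is V(s') for every step.
    v = np.append(baselines, 0.0)
    deltas = rewards + discount * v[1:] - v[:-1]
    advantages = discount_cumsum(deltas, discount * gae_lambda)
    returns = discount_cumsum(rewards, discount)
    return advantages, returns

rewards = np.array([1.0, 0.0, 1.0])
baselines = np.array([0.5, 0.4, 0.6])
adv, ret = gae_advantages(rewards, baselines)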
Example #54
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                if self.iter_callback is not None:
                    self.iter_callback(locals(), globals())

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(observation)
                    next_ob, reward, terminal, info = env.step(action)
                    path_length += 1
                    path_return += reward

                    self.pool.add_sample(
                        observation,
                        action,
                        reward,
                        terminal,
                        next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        observation = env.reset()
                        policy.reset()
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        observation = next_ob
                    gt.stamp('sample')

                    if self.pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self.pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self.pool.size)

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
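
This off-policy loop only relies on the pool exposing add_sample, random_batch, and size. A minimal ring-buffer sketch of that interface, assuming flat NumPy storage (the real PoolBase implementation may differ):

import numpy as np

class SimpleReplayPool:
    def __init__(self, max_size, obs_dim, action_dim):
        self._max_size = max_size
        self._obs = np.zeros((max_size, obs_dim))
        self._actions = np.zeros((max_size, action_dim))
        self._rewards = np.zeros(max_size)
        self._terminals = np.zeros(max_size, dtype=bool)
        self._next_obs = np.zeros((max_size, obs_dim))
        self._top = 0          # next write index (ring buffer)
        self.size = 0          # number of valid samples stored

    def add_sample(self, obs, action, reward, terminal, next_obs):
        i = self._top
        self._obs[i], self._actions[i] = obs, action
        self._rewards[i], self._terminals[i] = reward, terminal
        self._next_obs[i] = next_obs
        self._top = (self._top + 1) % self._max_size
        self.size = min(self.size + 1, self._max_size)

    def random_batch(self, batch_size):
        idx = np.random.randint(0, self.size, batch_size)
        return dict(observations=self._obs[idx],
                    actions=self._actions[idx],
                    rewards=self._rewards[idx],
                    terminals=self._terminals[idx],
                    next_observations=self._next_obs[idx])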
Example #55
    def optimize_policy(self, itr, all_samples_data):
        # Rollouts were collected for each of the num_grad_updates adaptation
        # steps plus one more set for the final evaluation ("test") step.
        assert len(all_samples_data) >= self.num_grad_updates + 1
        assert self.use_maml

        input_vals_list = []

        # Account for off-policy sampling when there is more than one beta step.
        theta0_dist_info_list = []
        for i in range(self.meta_batch_size):
            if 'agent_infos_orig' not in all_samples_data[0][i].keys():
                assert False, "agent_infos_orig is missing--this should have been handled in batch_maml_polopt"
            else:
                agent_infos_orig = all_samples_data[0][i]['agent_infos_orig']
            theta0_dist_info_list += [
                agent_infos_orig[k]
                for k in self.policy.distribution.dist_info_keys
            ]
        input_vals_list += tuple(theta0_dist_info_list)

        theta_l_dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[0][i]['agent_infos']
            theta_l_dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_vals_list += tuple(theta_l_dist_info_list)

        for step in range(self.num_grad_updates):
            (obs_list, action_list, adv_list, rewards_list, returns_list,
             path_lengths_list, expert_action_list) = [], [], [], [], [], [], []
            for i in range(self.meta_batch_size):  # for each task
                if not self.metalearn_baseline:
                    inputs = ext.extract(
                        all_samples_data[step][i],
                        "observations",
                        "actions",
                        "advantages",
                        "expert_actions",
                    )
                    obs_list.append(inputs[0])
                    action_list.append(inputs[1])
                    adv_list.append(inputs[2])
                    expert_action_list.append(inputs[3])
                else:
                    inputs = ext.extract(all_samples_data[step][i],
                                         "observations", "actions", "rewards",
                                         "returns", "expert_actions", "paths")
                    obs_list.append(inputs[0])
                    action_list.append(inputs[1])
                    rewards_list.append(inputs[2])
                    returns_list.append(inputs[3])
                    expert_action_list.append(inputs[4])
                    # path_lengths_list.append([len(p['rewards']) for p in inputs[5]])
            if not self.metalearn_baseline:
                input_vals_list += obs_list + action_list + adv_list + expert_action_list
            else:
                input_vals_list += obs_list + action_list + rewards_list + returns_list + expert_action_list  #+ path_lengths_list before expert action list

        for step in [self.num_grad_updates]:  # last step
            # The last step's adv_list is not currently used in maml_il.
            obs_list, action_list, expert_action_list = [], [], []
            for i in range(self.meta_batch_size):  # for each task
                inputs = ext.extract(
                    all_samples_data[step][i],
                    "observations",
                    "actions",
                    "expert_actions",
                )
                obs_list.append(inputs[0])
                action_list.append(inputs[1])
                expert_action_list.append(inputs[2])

            input_vals_list += obs_list + action_list + expert_action_list

        # Compute the KL divergence. This is mostly uninformative on non-testing
        # iterations because agent_infos are zeroed out on expert trajectory samples.
        dist_info_list = []
        for i in range(self.meta_batch_size):
            # agent_infos = {x:all_samples_data[self.kl_constrain_step][i]['agent_infos'][x] for x in ['mean','log_std']}  ##kl_constrain_step default is -1, meaning post all alpha grad updates
            agent_infos = all_samples_data[self.kl_constrain_step][i][
                'agent_infos']  ##kl_constrain_step default is -1, meaning post all alpha grad updates
            dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_vals_list += tuple(
            dist_info_list)  # This populates old_dist_info_vars_list

        #  logger.log("Computing KL before")
        #  mean_kl_before = self.optimizer.constraint_val(input_vals_list)  # TODO: need to make sure the input list has the correct form. Maybe start naming the input lists based on what they're needed for

        logger.log("Computing loss before")
        # loss_before = self.optimizer.loss(input_vals_list)
        if itr not in TESTING_ITRS:
            steps = self.adam_curve[min(itr, len(self.adam_curve) - 1)]
            logger.log("Optimizing using %s Adam steps on itr %s" %
                       (steps, itr))
            start_loss = self.optimizer.optimize(input_vals_list, steps=steps)
            # self.optimizer.optimize(input_vals_list)
            return start_loss

        else:
            logger.log("Not Optimizing")
            logger.record_tabular("ILLoss", float('nan'))
            return None
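
The optimizer input assembled above is a single flat list: for each gradient step, per-task tensors are extracted in a fixed key order and appended grouped by tensor, then by task. A small standalone sketch of that flattening, with an illustrative extract helper standing in for ext.extract:

def extract(d, *keys):
    # Pull values out of a dict in a fixed key order (mirrors ext.extract).
    return tuple(d[k] for k in keys)

# Two hypothetical per-task sample dicts for one gradient step.
samples_per_task = [
    dict(observations=[[0.1]], actions=[[1.0]], advantages=[0.5]),
    dict(observations=[[0.2]], actions=[[0.0]], advantages=[1.5]),
]

input_vals = []
obs_list, action_list, adv_list = [], [], []
for task in samples_per_task:
    obs, act, adv = extract(task, "observations", "actions", "advantages")
    obs_list.append(obs)
    action_list.append(act)
    adv_list.append(adv)
# Grouped first by tensor type, then by task, matching the flat input ordering.
input_vals += obs_list + action_list + adv_list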
Example #56
    def optimize_policy(self, itr, samples_data):
        # Init vars
        rewards = samples_data['rewards']
        actions = samples_data['actions']
        observations = samples_data['observations']

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        if self.policy.recurrent:
            recurrent_vals = [samples_data["valids"]]
        else:
            recurrent_vals = []
        # Compute sample Bellman error.
        feat_diff = []
        for path in samples_data['paths']:
            feats = self._features(path)
            feats = np.vstack([feats, np.zeros(feats.shape[1])])
            feat_diff.append(feats[1:] - feats[:-1])
        if self.policy.recurrent:
            max_path_length = max([len(path["advantages"]) for path in samples_data["paths"]])
            # pad feature diffs
            feat_diff = np.array([tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff])
        else:
            feat_diff = np.vstack(feat_diff)

        #################
        # Optimize dual #
        #################

        # Optimize the dual g(\eta, v) with BFGS to obtain \eta (constrained to
        # \eta > 0) and v; the feature differences it needs were computed above.
        f_dual = self.opt_info['f_dual']
        f_dual_grad = self.opt_info['f_dual_grad']

        # Set BFGS eval function
        def eval_dual(input):
            param_eta = input[0]
            param_v = input[1:]
            val = f_dual(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
            return val.astype(np.float64)

        # Set BFGS gradient eval function
        def eval_dual_grad(input):
            param_eta = input[0]
            param_v = input[1:]
            grad = f_dual_grad(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
            eta_grad = float(grad[0])  # np.float was removed in NumPy 1.24+
            v_grad = grad[1]
            return np.hstack([eta_grad, v_grad])

        # Initial BFGS parameter values.
        x0 = np.hstack([self.param_eta, self.param_v])

        # Set parameter boundaries: \eta>0, v unrestricted.
        bounds = [(-np.inf, np.inf) for _ in x0]
        bounds[0] = (0., np.inf)

        # Optimize through BFGS
        logger.log('optimizing dual')
        eta_before = x0[0]
        dual_before = eval_dual(x0)
        params_ast, _, _ = self.optimizer(
            func=eval_dual, x0=x0,
            fprime=eval_dual_grad,
            bounds=bounds,
            maxiter=self.max_opt_itr,
            disp=0
        )
        dual_after = eval_dual(params_ast)

        # Optimal values have been obtained
        self.param_eta = params_ast[0]
        self.param_v = params_ast[1:]

        ###################
        # Optimize policy #
        ###################
        cur_params = self.policy.get_param_values(trainable=True)
        f_loss = self.opt_info["f_loss"]
        f_loss_grad = self.opt_info['f_loss_grad']
        input = [rewards, observations, feat_diff,
                 actions] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

        # Set loss eval function
        def eval_loss(params):
            self.policy.set_param_values(params, trainable=True)
            val = f_loss(*input)
            return val.astype(np.float64)

        # Set loss gradient eval function
        def eval_loss_grad(params):
            self.policy.set_param_values(params, trainable=True)
            grad = f_loss_grad(*input)
            flattened_grad = tensor_utils.flatten_tensors(list(map(np.asarray, grad)))
            return flattened_grad.astype(np.float64)

        loss_before = eval_loss(cur_params)
        logger.log('optimizing policy')
        params_ast, _, _ = self.optimizer(
            func=eval_loss, x0=cur_params,
            fprime=eval_loss_grad,
            disp=0,
            maxiter=self.max_opt_itr
        )
        loss_after = eval_loss(params_ast)

        f_kl = self.opt_info['f_kl']

        mean_kl = f_kl(*([observations, actions] + state_info_list + dist_info_list + recurrent_vals)).astype(
            np.float64)

        logger.log('eta %f -> %f' % (eta_before, self.param_eta))

        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)
        logger.record_tabular('DualBefore', dual_before)
        logger.record_tabular('DualAfter', dual_after)
        logger.record_tabular('MeanKL', mean_kl)
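
The optimizer invoked here follows the calling convention of scipy.optimize.fmin_l_bfgs_b: an objective, its gradient, box bounds (eta >= 0, v unrestricted), and an iteration limit, returning the optimum plus auxiliary info. A minimal sketch of the same pattern on a toy convex function (the objective is illustrative, not the actual REPS dual):

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def dual(x):
    # Toy stand-in for g(eta, v): a smooth convex function of [eta, v...].
    eta, v = x[0], x[1:]
    return (eta - 2.0) ** 2 + np.sum((v - 1.0) ** 2)

def dual_grad(x):
    eta, v = x[0], x[1:]
    return np.hstack([2.0 * (eta - 2.0), 2.0 * (v - 1.0)])

x0 = np.hstack([1.0, np.zeros(3)])           # initial [eta, v]
bounds = [(0.0, None)] + [(None, None)] * 3  # eta >= 0, v unconstrained
x_opt, f_opt, info = fmin_l_bfgs_b(func=dual, x0=x0, fprime=dual_grad,
                                   bounds=bounds, maxiter=50, disp=0)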
Example #57
    def process_samples(self,
                        itr,
                        paths,
                        prefix='',
                        log=True,
                        fast_process=False,
                        testitr=False,
                        metalearn_baseline=False):
        baselines = []
        returns = []
        if testitr:
            metalearn_baseline = False
        train_baseline = (itr in BASELINE_TRAINING_ITRS)
        if not fast_process:
            for idx, path in enumerate(paths):
                path["returns"] = special.discount_cumsum(
                    path["rewards"], self.algo.discount)
        if not fast_process and not metalearn_baseline:
            if log:
                logger.log("fitting baseline...")
            if hasattr(self.algo.baseline, 'fit_with_samples'):
                # TODO: samples_data is not defined yet at this point, so this
                # branch would raise a NameError; it does not appear to be
                # taken in practice.
                self.algo.baseline.fit_with_samples(paths, samples_data)
            else:
                # print("debug21 baseline before fitting",self.algo.baseline.predict(paths[0])[0:2], "...",self.algo.baseline.predict(paths[0])[-3:-1])
                # print("debug23 predloss before fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))

                self.algo.baseline.fit(paths, log=log)
                # print("debug25 predloss AFTER  fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))
                # print("debug22 returns                ",paths[0]['returns'][0:2], "...",paths[0]['returns'][-3:-1])
                # print("debug24 baseline after  fitting",self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1])
            if log:
                logger.log("fitted")

            if 'switch_to_init_dist' in dir(self.algo.baseline):
                self.algo.baseline.switch_to_init_dist()

            if train_baseline:
                self.algo.baseline.fit_train_baseline(paths)

            if hasattr(self.algo.baseline, "predict_n"):
                all_path_baselines = self.algo.baseline.predict_n(paths)
            else:
                all_path_baselines = [
                    self.algo.baseline.predict(path) for path in paths
                ]

        for idx, path in enumerate(paths):
            if not fast_process and not metalearn_baseline:
                # if idx==0:
                # print("debug22", all_path_baselines[idx])
                # print("debug23", path['returns'])

                path_baselines = np.append(all_path_baselines[idx], 0)
                deltas = path["rewards"] + \
                         self.algo.discount * path_baselines[1:] - \
                         path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda)
                baselines.append(path_baselines[:-1])
            if not fast_process:
                returns.append(path["returns"])
            if "expert_actions" not in path.keys():
                if "expert_actions" in path["env_infos"].keys():
                    path["expert_actions"] = path["env_infos"][
                        "expert_actions"]
                else:
                    # assert False, "you shouldn't need expert_actions"
                    path["expert_actions"] = np.array(
                        [[None] * len(path['actions'][0])] *
                        len(path['actions']))

        if not fast_process and not metalearn_baseline:  # TODO: we want the ev eventually
            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))
            l2 = np.linalg.norm(np.array(baselines) - np.array(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])

            if not fast_process:
                rewards = tensor_utils.concat_tensor_list(
                    [path["rewards"] for path in paths])
                returns = tensor_utils.concat_tensor_list(
                    [path["returns"] for path in paths])

            if "env_infos" in paths[0].keys():
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                advantages = tensor_utils.concat_tensor_list(
                    [path["advantages"] for path in paths])
                # print("debug, advantages are", advantages,)
                # print("debug, shape of advantages is", type(advantages), np.shape(advantages))

            expert_actions = tensor_utils.concat_tensor_list(
                [path["expert_actions"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)
                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)
                if "meta_predict" in dir(self.algo.baseline):
                    # print("debug, advantages are", advantages, )
                    advantages = advantages + self.algo.baseline.meta_predict(
                        observations)
                    print("debug, metalearned baseline constant is",
                          self.algo.baseline.meta_predict(observations)[0:2],
                          "...",
                          self.algo.baseline.meta_predict(observations)[-3:-1])
                    # print("debug, metalearned baseline constant shape is", np.shape(self.algo.baseline.meta_predict(observations)))
                # print("debug, advantages are", advantages[0:2],"...", advantages[-3:-1])
                # print("debug, advantages shape is", np.shape(advantages))

            # average_discounted_return = \
            #     np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path.get("rewards", [0])) for path in paths
            ]

            # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
            if fast_process:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
            elif metalearn_baseline:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                        [path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig
            else:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                        [path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig

        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path.get("rewards", [0])) for path in paths
            ]

            # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )
        if log:
            # logger.record_tabular('Iteration', itr)
            # logger.record_tabular('AverageDiscountedReturn',
            #                      average_discounted_return)
            logger.record_tabular(prefix + 'AverageReturn',
                                  np.mean(undiscounted_returns))
            if testitr and prefix == "1":  # TODO make this functional for more than 1 iteration
                self.memory["AverageReturnLastTest"] = np.mean(
                    undiscounted_returns)
                self.memory["AverageReturnBestTest"] = max(
                    self.memory["AverageReturnLastTest"],
                    self.memory["AverageReturnBestTest"])
                if self.memory["AverageReturnBestTest"] == 0.0:
                    self.memory["AverageReturnBestTest"] = self.memory[
                        "AverageReturnLastTest"]
            if not fast_process and not metalearn_baseline:
                logger.record_tabular(prefix + 'ExplainedVariance', ev)
                logger.record_tabular(prefix + 'BaselinePredLoss', l2)

            logger.record_tabular(prefix + 'NumTrajs', len(paths))
            # logger.record_tabular(prefix + 'Entropy', ent)
            # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
            logger.record_tabular(prefix + 'StdReturn',
                                  np.std(undiscounted_returns))
            logger.record_tabular(prefix + 'MaxReturn',
                                  np.max(undiscounted_returns))
            logger.record_tabular(prefix + 'MinReturn',
                                  np.min(undiscounted_returns))
            if "env_infos" in paths[0].keys(
            ) and "success_left" in paths[0]["env_infos"].keys():
                logger.record_tabular(prefix + 'success_left',
                                      eval_success_left(paths))
                logger.record_tabular(prefix + 'success_right',
                                      eval_success_right(paths))
            # else:
            # logger.record_tabular(prefix + 'success_left', -1.0)
            # logger.record_tabular(prefix + 'success_right', -1.0)
        # if metalearn_baseline:
        #     if hasattr(self.algo.baseline, "revert"):
        #         self.algo.baseline.revert()

        return samples_data
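
Both recurrent branches above depend on padding variable-length paths to a common length and carrying a valids mask so padded timesteps are ignored in averages. A minimal NumPy sketch of that padding and masking, standing in for tensor_utils.pad_tensor_n:

import numpy as np

def pad_tensor(x, max_len):
    # Zero-pad a (T, ...) array along the time axis to length max_len.
    pad_width = [(0, max_len - len(x))] + [(0, 0)] * (x.ndim - 1)
    return np.pad(x, pad_width, mode='constant')

paths = [np.random.randn(3, 2), np.random.randn(5, 2)]   # two trajectories
max_len = max(len(p) for p in paths)
padded = np.stack([pad_tensor(p, max_len) for p in paths])                # (2, 5, 2)
valids = np.stack([pad_tensor(np.ones(len(p)), max_len) for p in paths])  # (2, 5)
# Masked mean over valid timesteps only, as done for the entropy above.
masked_mean = np.sum(padded[..., 0] * valids) / np.sum(valids)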
Example #58
    def train(self):
        parallel_sampler.populate_task(self.env, self.policy)
        if self.plot:
            plotter.init_plot(self.env, self.policy)

        cur_std = self.init_std
        cur_mean = self.policy.get_param_values()
        # K = cur_mean.size
        n_best = max(1, int(self.n_samples * self.best_frac))

        for itr in range(self.n_itr):
            # sample around the current distribution
            extra_var_mult = max(1.0 - itr / self.extra_decay_time, 0)
            sample_std = np.sqrt(np.square(cur_std) + np.square(self.extra_std) * extra_var_mult)
            if self.batch_size is None:
                criterion = 'paths'
                threshold = self.n_samples
            else:
                criterion = 'samples'
                threshold = self.batch_size
            infos = stateful_pool.singleton_pool.run_collect(
                _worker_rollout_policy,
                threshold=threshold,
                args=(dict(cur_mean=cur_mean,
                          sample_std=sample_std,
                          max_path_length=self.max_path_length,
                          discount=self.discount,
                          criterion=criterion),)
            )
            xs = np.asarray([info[0] for info in infos])
            paths = [info[1] for info in infos]

            fs = np.array([path['returns'][0] for path in paths])
            print((xs.shape, fs.shape))
            best_inds = (-fs).argsort()[:n_best]
            best_xs = xs[best_inds]
            cur_mean = best_xs.mean(axis=0)
            cur_std = best_xs.std(axis=0)
            best_x = best_xs[0]
            logger.push_prefix('itr #%d | ' % itr)
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('CurStdMean', np.mean(cur_std))
            undiscounted_returns = np.array([path['undiscounted_return'] for path in paths])
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('StdReturn',
                                  np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn',
                                  np.max(undiscounted_returns))
            logger.record_tabular('MinReturn',
                                  np.min(undiscounted_returns))
            logger.record_tabular('AverageDiscountedReturn',
                                  np.mean(fs))
            logger.record_tabular('AvgTrajLen',
                                  np.mean([len(path['returns']) for path in paths]))
            logger.record_tabular('NumTrajs',
                                  len(paths))
            self.policy.set_param_values(best_x)
            self.env.log_diagnostics(paths)
            self.policy.log_diagnostics(paths)
            logger.save_itr_params(itr, dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
                cur_mean=cur_mean,
                cur_std=cur_std,
            ))
            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            if self.plot:
                plotter.update_plot(self.policy, self.max_path_length)
        parallel_sampler.terminate_task()
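
The loop above is the cross-entropy method: sample parameter vectors around (cur_mean, sample_std), keep the best best_frac fraction by return, and refit the Gaussian to those elites. A self-contained toy version of the same update (objective and constants are illustrative):

import numpy as np

def objective(x):
    return -np.sum((x - 3.0) ** 2)   # maximized at x = 3

dim, n_samples, best_frac = 5, 100, 0.1
cur_mean, cur_std = np.zeros(dim), np.ones(dim)
n_best = max(1, int(n_samples * best_frac))

for itr in range(50):
    xs = cur_mean + cur_std * np.random.randn(n_samples, dim)
    fs = np.array([objective(x) for x in xs])
    best = xs[(-fs).argsort()[:n_best]]   # elites: highest returns
    cur_mean, cur_std = best.mean(axis=0), best.std(axis=0) + 1e-6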
Example #59
 def log_diagnostics(self):
     super(SimpleSampler, self).log_diagnostics()
     logger.record_tabular('max-path-return', self._max_path_return)
     logger.record_tabular('last-path-return', self._last_path_return)
     logger.record_tabular('episodes', self._n_episodes)
     logger.record_tabular('total-samples', self._total_samples)
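
All of these snippets share the same logging pattern: accumulate key/value pairs with record_tabular during an iteration, then flush one table row with dump_tabular. A minimal stand-in that mimics that behaviour (this is not rllab's logger, just a sketch of the pattern):

class TabularLogger:
    def __init__(self):
        self._row = {}
        self._wrote_header = False

    def record_tabular(self, key, value):
        self._row[key] = value

    def dump_tabular(self):
        # Print the header once, then one tab-separated row per call.
        if not self._wrote_header:
            print('\t'.join(self._row.keys()))
            self._wrote_header = True
        print('\t'.join(str(v) for v in self._row.values()))
        self._row = {}

logger = TabularLogger()
for itr in range(3):
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageReturn', 100.0 + itr)
    logger.dump_tabular()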