def enrich_expert_trajectories(self,
                                origin_folder,
                                goal_number,
                                destination_folder,
                                fake=False):
     trajs_for_goal = joblib.load(origin_folder + str(goal_number) + ".pkl")
     goal_distractions_and_shuffle_order = joblib.load(
         destination_folder + "goals_pool.pkl")['goals_pool'][goal_number]
     shuffle_order = goal_distractions_and_shuffle_order[-1].reshape((3, ))
     goal_and_distractions = np.concatenate(
         (goal_distractions_and_shuffle_order[int(shuffle_order[0])],
          goal_distractions_and_shuffle_order[int(shuffle_order[1])],
          goal_distractions_and_shuffle_order[int(shuffle_order[2])]))
     print("goals for index", goal_number, ":\n", goal_and_distractions)
     print("shuffle order for index", goal_number, ":\n", shuffle_order)
     new_trajs_for_goal = []
     for traj in trajs_for_goal:
         obs = traj['observations']
         new_obs = [
             np.concatenate((obs_step, goal_and_distractions.reshape(
                 (9, )))) for obs_step in obs
         ]
         new_traj = copy.deepcopy(traj)
         new_traj['observations'] = new_obs
         new_trajs_for_goal.append(new_traj)
     joblib_dump_safe(
         new_trajs_for_goal, destination_folder + str(goal_number) +
         ("" if not fake else "fake") + "dist.pkl")
    def _setup_goals(self, goals_pool_to_load, goals_pickle_to):
        if goals_pool_to_load is not None:
            # load goals
            logger.log("Loading goals pool from %s ..." % goals_pool_to_load)
            loaded_pool = joblib.load(goals_pool_to_load)
            self.goals_pool = loaded_pool['goals_pool']
            self.goals_idxs_for_itr_dict = loaded_pool['idxs_dict']
        else:
            # build goals pool and idxs_dict
            goals_pool_size = (self.n_itr -
                               self.start_itr) * self.meta_batch_size
            logger.log("Sampling a pool of tasks/goals for this meta-batch...")
            env = self.env
            # unwrap nested environment wrappers until one exposes sample_goals
            while 'sample_goals' not in dir(env):
                env = env.wrapped_env
            self.goals_pool = env.sample_goals(goals_pool_size)
            self.goals_idxs_for_itr_dict = {}
            for itr in range(self.start_itr, self.n_itr):
                self.goals_idxs_for_itr_dict[itr] = rd.sample(
                    range(goals_pool_size), self.meta_batch_size)

            # save goals pool
            if goals_pickle_to is not None:
                # logger.log("Saving goals to %s..." % goals_pickle_to)
                # joblib_dump_safe(self.goals_to_use_dict, goals_pickle_to)
                logger.log("Saving goals pool to %s..." % goals_pickle_to)
                joblib_dump_safe(
                    dict(goals_pool=self.goals_pool,
                         idxs_dict=self.goals_idxs_for_itr_dict),
                    goals_pickle_to)

        # inspect goals pool
        env = self.env
        while 'sample_goals' not in dir(env):
            env = env.wrapped_env
        reset_dimensions = env.sample_goals(1).shape[1:]
        dimensions = np.shape(
            self.goals_pool[self.goals_idxs_for_itr_dict[self.start_itr][0]])
        assert reset_dimensions == dimensions, "loaded goal dimensions %s do not match the environment's %s" % (
            dimensions, reset_dimensions)

        # inspect goals_idxs_for_itr_dict
        assert set(range(self.start_itr, self.n_itr)).issubset(set(self.goals_idxs_for_itr_dict.keys())), \
            "Not all meta-iteration numbers have idx_dict in %s" % goals_pool_to_load
        for itr in range(self.start_itr, self.n_itr):
            num_goals = len(self.goals_idxs_for_itr_dict[itr])
            assert num_goals >= self.meta_batch_size, "iteration %s contained %s goals when at least %s are needed" % (
                itr, num_goals, self.meta_batch_size)
            self.goals_idxs_for_itr_dict[itr] = self.goals_idxs_for_itr_dict[
                itr][:self.meta_batch_size]

        # build goals_to_use_dict
        self.goals_to_use_dict = {}
        for itr in range(self.start_itr, self.n_itr):
            if itr not in self.testing_itrs or self.test_on_training_goals:
                self.goals_to_use_dict[itr] = np.array([
                    self.goals_pool[idx]
                    for idx in self.goals_idxs_for_itr_dict[itr]
                ])
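
# Hedged sketch (not from the original class): a minimal goals-pool pickle in the format
# that the loading branch of _setup_goals expects -- the same dict layout the save branch
# above writes. The goal dimension and the iteration/batch sizes are illustrative only,
# and "example_goals_pool.pkl" is a made-up filename.
import random as rd
import numpy as np
from rllab.sampler.utils import joblib_dump_safe

n_itr, start_itr, meta_batch_size = 2, 0, 4
pool_size = (n_itr - start_itr) * meta_batch_size
example_pool = dict(
    goals_pool=np.random.uniform(-0.2, 0.2, size=(pool_size, 2)),  # one goal per row
    idxs_dict={itr: rd.sample(range(pool_size), meta_batch_size)   # >= meta_batch_size idxs per itr
               for itr in range(start_itr, n_itr)})
joblib_dump_safe(example_pool, "example_goals_pool.pkl")
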
 def attach_zeros_expert_trajectories(self,
                                      origin_folder,
                                      goal_number,
                                      destination_folder,
                                      extra_dim=0,
                                      suffix=""):
     trajs_for_goal = joblib.load(origin_folder + str(goal_number) +
                                  suffix + ".pkl")
     extra_input = np.array([0.] * extra_dim)
     new_trajs_for_goal = []
     for traj in trajs_for_goal:
         obs = traj['observations']
         new_obs = [
             np.concatenate((obs_step, extra_input)) for obs_step in obs
         ]
         new_traj = copy.deepcopy(traj)
         new_traj['observations'] = new_obs
         new_trajs_for_goal.append(new_traj)
     out_path = (destination_folder + str(goal_number) + suffix + "_" +
                 str(extra_dim) + ".pkl")
     print("writing", out_path)
     joblib_dump_safe(new_trajs_for_goal, out_path)
# moving the env_infos/img to observations
import numpy as np
import joblib
from rllab.sampler.utils import joblib_dump_safe

for goal in ["0"]:  #,"1","2"]:
    a = joblib.load(
        "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/raw/%s.pkl"
        % goal)
    for path in a:
        path['observations'] = path['env_infos']['img']
        path['env_infos'] = {}
    joblib_dump_safe(
        a,
        "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/%s.pkl"
        % goal)

# creating a dummy goals pool

import numpy as np
import joblib
from rllab.sampler.utils import joblib_dump_safe

gp = joblib.load(
    "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/goals_pool.pkl"
)

gp_dummy = {}
gp_dummy['goals_pool'] = [0]
gp_dummy['idxs_dict'] = {}
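
# A hedged completion (not in the original snippet): persist the dummy pool. Mapping every
# iteration to the single dummy goal assumes a meta-batch size of 1, and the
# "goals_pool_dummy.pkl" filename is an assumption rather than a path from the repo.
gp_dummy['idxs_dict'] = {itr: [0] for itr in gp['idxs_dict']}
joblib_dump_safe(
    gp_dummy,
    "/home/rosen/maml_rl/saved_expert_traj/R7DOF/R7-ET-vision-rgb_dummy/goals_pool_dummy.pkl"
)
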
# Example #5
    def train(self):
        with tf.Session() as sess:
            if self.load_policy is not None:
                self.policy = joblib.load(self.load_policy)['policy']
            self.init_opt()
            # initialize uninitialized vars (I know, it's ugly)
            uninit_vars = []
            for var in tf.all_variables():
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninit_vars.append(var)
            sess.run(tf.initialize_variables(uninit_vars))
            #sess.run(tf.initialize_all_variables())
            self.start_worker()
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                if itr == self.n_itr - 1:
                    self.policy.std_modifier = 0.0001
                    self.policy.recompute_dist_for_adjusted_std()
                if itr in self.goals_for_ET_dict.keys():
                    # self.policy.std_modifier = 0.0001
                    # self.policy.recompute_dist_for_adjusted_std()
                    goals = self.goals_for_ET_dict[itr]
                    noise = self.action_noise_test
                    self.batch_size = self.batch_size_expert_traj
                else:
                    if self.reset_arg is None:
                        goals = [None]
                    else:
                        goals = [self.reset_arg]
                    noise = self.action_noise_train
                    self.batch_size = self.batch_size_train
                paths_to_save = {}
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):

                    logger.log("Obtaining samples...")
                    paths = []
                    for goalnum, goal in enumerate(goals):
                        preupdate = itr < self.n_itr - 1
                        # paths_for_goal = self.obtain_samples(itr=itr, reset_args=[{'goal': goal, 'noise': noise}])  # when using oracle environments with changing noise, use this line!
                        paths_for_goal = self.obtain_samples(
                            itr=itr,
                            reset_args=[{
                                'goal': goal,
                                'noise': noise
                            }],
                            preupdate=preupdate)
                        print("debug, goal 1", goal)
                        paths.extend(
                            paths_for_goal
                        )  # we need this to be flat because we process all of them together
                        # TODO: there's a bunch of sample processing happening below that we should abstract away
                        if itr in self.expert_traj_itrs_to_pickle:
                            logger.log("Saving trajectories...")
                            paths_no_goalobs = self.clip_goal_from_obs(
                                paths_for_goal)
                            for path in paths_no_goalobs:
                                path.pop('agent_infos')
                            paths_to_save[goalnum] = paths_no_goalobs
                    if itr in self.expert_traj_itrs_to_pickle:
                        logger.log("Pickling trajectories...")
                        assert len(
                            paths_to_save.keys()
                        ) == 1, "we're going through ET goals one at a time now 10/24/17"
                        joblib_dump_safe(
                            paths_to_save[0],
                            self.save_expert_traj_dir + str(itr) + ".pkl")
                        logger.log("Fast-processing returns...")
                        undiscounted_returns = [
                            sum(path['rewards']) for path in paths
                        ]
                        print("debug", undiscounted_returns)
                        logger.record_tabular('AverageReturn',
                                              np.mean(undiscounted_returns))

                    else:
                        logger.log("Processing samples...")
                        samples_data = self.process_samples(itr, paths)
                        logger.log("Logging diagnostics...")
                        self.log_diagnostics(paths)
                        logger.log("Optimizing policy...")
                        self.optimize_policy(itr, samples_data)
                        #new_param_values = self.policy.get_variable_values(self.policy.all_params)
                        logger.log("Saving snapshot...")
                        params = self.get_itr_snapshot(
                            itr, samples_data)  # , **kwargs)
                        if self.store_paths:
                            params["paths"] = samples_data["paths"]

                    # NOTE: `params` is only assigned in the sample-processing branch above; on a
                    # trajectory-pickling iteration this saves the snapshot from the most recent
                    # non-pickling iteration (and is undefined if the very first iteration pickles).
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)

                    if itr % 16 == 0 and 7 < self.env.observation_space.shape[0] < 12:  # ReacherEnvOracleNoise
                        logger.log("Saving visualization of paths")
                        plt.clf()
                        plt.hold(True)

                        # goals and xy traces (obs[6:8]) for the first three sampled paths
                        for path, style in zip(paths[:3], ['-r', '--r', '-.r']):
                            goal = path['observations'][0][-2:]
                            plt.plot(goal[0], goal[1], 'k*', markersize=10)
                            points = np.array(
                                [obs[6:8] for obs in path['observations']])
                            plt.plot(points[:, 0], points[:, 1], style, linewidth=2)

                        plt.plot(0, 0, 'k.', markersize=5)
                        plt.xlim([-0.25, 0.25])
                        plt.ylim([-0.25, 0.25])
                        plt.legend(['path'])
                        plt.savefig(
                            osp.join(logger.get_snapshot_dir(), 'path' +
                                     str(0) + '_' + str(itr) + '.png'))
                        print(
                            osp.join(logger.get_snapshot_dir(), 'path' +
                                     str(0) + '_' + str(itr) + '.png'))

                    # if self.make_video and itr % 2 == 0 or itr in [0,1,2,3,4,5,6,7,8]: # and itr in self.goals_for_ET_dict.keys() == 0:
                    if self.make_video:  # itr always lies in [0, n_itr - 1] within this loop
                        logger.log("Saving videos...")
                        self.env.reset(reset_args=goals[0])
                        video_filename = osp.join(
                            logger.get_snapshot_dir(),
                            'post_path_%s_0_%s.gif' %
                            (itr, time.strftime("%H%M%S")))
                        rollout(
                            env=self.env,
                            agent=self.policy,
                            max_path_length=self.max_path_length,
                            animated=True,
                            speedup=2,
                            save_video=True,
                            video_filename=video_filename,
                            reset_arg=goals[0],
                            use_maml=False,
                        )
                        # self.env.reset(reset_args=goals[0])
                        # video_filename = osp.join(logger.get_snapshot_dir(), 'post_path_%s_1_%s.gif' % (itr,time.strftime("%H%M%S")))
                        # rollout(env=self.env, agent=self.policy, max_path_length=self.max_path_length,
                        #         animated=True, speedup=2, save_video=True, video_filename=video_filename,
                        #         reset_arg=goals[0],
                        #         use_maml=False, )
                        # self.env.reset(reset_args=goals[0])
                        # video_filename = osp.join(logger.get_snapshot_dir(), 'post_path_%s_2_%s.gif' % (itr,time.strftime("%H%M%S")))
                        # rollout(env=self.env, agent=self.policy, max_path_length=self.max_path_length,
                        #         animated=True, speedup=2, save_video=True, video_filename=video_filename,
                        #         reset_arg=goals[0],
                        #         use_maml=False, )

                    # debugging
                    """
                    if itr % 1 == 0:
                        logger.log("Saving visualization of paths")
                        import matplotlib.pyplot as plt;
                        for ind in range(5):
                            plt.clf(); plt.hold(True)
                            points = paths[ind]['observations']
                            plt.plot(points[:,0], points[:,1], '-r', linewidth=2)
                            plt.xlim([-1.0, 1.0])
                            plt.ylim([-1.0, 1.0])
                            plt.legend(['path'])
                            plt.savefig('/home/cfinn/path'+str(ind)+'.png')
                    """
                    # end debugging

                    logger.dump_tabular(with_prefix=False)
                    if self.plot:
                        self.update_plot()
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")

        # drop into an interactive debugger before shutting down the workers
        import ipdb
        ipdb.set_trace()
        brace = 'a'  # throwaway statement left from interactive debugging
        self.shutdown_worker()
# Example #6
    def __init__(
            self,
            env,
            policy,
            baseline,
            metalearn_baseline=False,
            scope=None,
            n_itr=500,
            start_itr=0,
            # batch_size is the number of trajectories per fast gradient update; it is
            # converted below into a total transition count
            # (batch_size * max_path_length * meta_batch_size).
            batch_size=100,
            max_path_length=500,
            meta_batch_size=100,
            num_grad_updates=1,
            discount=0.99,
            gae_lambda=1,
            beta_steps=1,
            beta_curve=None,
            plot=False,
            pause_for_plot=False,
            make_video=False,
            center_adv=True,
            positive_adv=False,
            store_paths=False,
            whole_paths=True,
            fixed_horizon=False,
            sampler_cls=None,
            sampler_args=None,
            force_batch_sampler=False,
            use_maml=True,
            use_maml_il=False,
            test_on_training_goals=False,
            limit_demos_num=None,
            test_goals_mult=1,
            load_policy=None,
            pre_std_modifier=1.0,
            post_std_modifier_train=1.0,
            post_std_modifier_test=1.0,
            goals_to_load=None,
            goals_pool_to_load=None,
            expert_trajs_dir=None,
            expert_trajs_suffix="",
            goals_pickle_to=None,
            goals_pool_size=None,
            use_pooled_goals=True,
            extra_input=None,
            extra_input_dim=0,
            seed=1,
            **kwargs):
        """
        :param env: Environment
        :param policy: Policy
        :type policy: Policy
        :param baseline: Baseline
        :param scope: Scope for identifying the algorithm. Must be specified if running multiple algorithms
        simultaneously, each using different environments and policies
        :param n_itr: Number of iterations.
        :param start_itr: Starting iteration.
        :param batch_size: Number of trajectories per fast gradient update.
        :param max_path_length: Maximum length of a single rollout.
        :param meta_batch_size: Number of tasks sampled per meta-update
        :param num_grad_updates: Number of fast gradient updates
        :param discount: Discount.
        :param gae_lambda: Lambda used for generalized advantage estimation.
        :param plot: Plot evaluation run after each iteration.
        :param pause_for_plot: Whether to pause before continuing when plotting.
        :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1.
        :param positive_adv: Whether to shift the advantages so that they are always positive. When used in
        conjunction with center_adv the advantages will be standardized before shifting.
        :param store_paths: Whether to save all paths data to the snapshot.
        :return:
        """
        self.seed = seed
        self.env = env
        self.policy = policy
        self.load_policy = load_policy
        self.baseline = baseline
        self.metalearn_baseline = metalearn_baseline
        self.scope = scope
        self.n_itr = n_itr
        self.start_itr = start_itr
        # batch_size is the number of trajectories for one fast grad update.
        # self.batch_size is the number of total transitions to collect.
        self.batch_size = batch_size * max_path_length * meta_batch_size
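        # e.g. with the defaults above (batch_size=100, max_path_length=500,
        # meta_batch_size=100) this requests 100 * 500 * 100 = 5,000,000 transitions per iteration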
        self.max_path_length = max_path_length
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.beta_steps = beta_steps
        self.beta_curve = beta_curve if beta_curve is not None else [
            self.beta_steps
        ]
        self.old_il_loss = None
        self.plot = plot
        self.pause_for_plot = pause_for_plot
        self.make_video = make_video
        self.center_adv = center_adv
        self.positive_adv = positive_adv
        self.store_paths = store_paths
        self.whole_paths = whole_paths
        self.fixed_horizon = fixed_horizon
        self.meta_batch_size = meta_batch_size  # number of tasks
        self.num_grad_updates = num_grad_updates  # number of gradient steps during training
        self.use_maml_il = use_maml_il
        self.test_on_training_goals = test_on_training_goals
        self.testing_itrs = list(TESTING_ITRS)  # copy so the module-level list is not mutated
        if self.metalearn_baseline:
            self.testing_itrs.insert(0, 0)
        print("test_on_training_goals", self.test_on_training_goals)
        self.limit_demos_num = limit_demos_num
        self.test_goals_mult = test_goals_mult
        self.pre_std_modifier = pre_std_modifier
        self.post_std_modifier_train = post_std_modifier_train
        self.post_std_modifier_test = post_std_modifier_test
        #   self.action_limiter_multiplier = action_limiter_multiplier
        self.expert_trajs_dir = expert_trajs_dir
        self.expert_trajs_suffix = expert_trajs_suffix
        self.use_pooled_goals = use_pooled_goals
        self.extra_input = extra_input
        self.extra_input_dim = extra_input_dim
        # Next, we set up the goals (and potentially the expert trajectories) that we plan to use.
        # If expert trajectories are used, their directory also provides the goals pool (see below).
        assert goals_to_load is None, "deprecated"
        if self.use_pooled_goals:
            if expert_trajs_dir is not None:
                assert goals_pool_to_load is None, "expert_trajs already comes with its own goals, please disable goals_pool_to_load"
                goals_pool = joblib.load(self.expert_trajs_dir +
                                         "goals_pool.pkl")
                self.goals_pool = goals_pool['goals_pool']
                self.goals_idxs_for_itr_dict = goals_pool['idxs_dict']
                if "demos_path" in goals_pool.keys():
                    self.demos_path = goals_pool["demos_path"]
                else:
                    self.demos_path = expert_trajs_dir
                print("successfully extracted goals pool",
                      self.goals_idxs_for_itr_dict.keys())
            elif goals_pool_to_load is not None:
                logger.log("Loading goals pool from %s ..." %
                           goals_pool_to_load)
                loaded_pool = joblib.load(goals_pool_to_load)
                self.goals_pool = loaded_pool['goals_pool']
                self.goals_idxs_for_itr_dict = loaded_pool['idxs_dict']
            else:
                # we build our own goals pool and idxs_dict
                if goals_pool_size is None:
                    self.goals_pool_size = (
                        self.n_itr - self.start_itr) * self.meta_batch_size
                else:
                    self.goals_pool_size = goals_pool_size

                logger.log(
                    "Sampling a pool of tasks/goals for this meta-batch...")
                env = self.env
                while 'sample_goals' not in dir(env):
                    env = env.wrapped_env
                self.goals_pool = env.sample_goals(self.goals_pool_size)
                self.goals_idxs_for_itr_dict = {}
                for itr in range(self.start_itr, self.n_itr):
                    self.goals_idxs_for_itr_dict[itr] = rd.sample(
                        range(self.goals_pool_size), self.meta_batch_size)

            # inspecting the goals pool
            env = self.env
            while 'sample_goals' not in dir(env):
                env = env.wrapped_env
            reset_dimensions = env.sample_goals(1).shape[1:]
            dimensions = np.shape(
                self.goals_pool[self.goals_idxs_for_itr_dict[self.start_itr][0]])
            assert reset_dimensions == dimensions, "loaded goal dimensions %s do not match the environment's %s" % (
                dimensions, reset_dimensions)
            # inspecting goals_idxs_for_itr_dict
            assert set(range(self.start_itr, self.n_itr)).issubset(set(self.goals_idxs_for_itr_dict.keys())), \
                "Not all meta-iteration numbers have idx_dict in %s" % goals_pool_to_load
            for itr in range(self.start_itr, self.n_itr):
                num_goals = len(self.goals_idxs_for_itr_dict[itr])
                assert num_goals >= self.meta_batch_size, "iteration %s contained %s goals when at least %s are needed" % (
                    itr, num_goals, self.meta_batch_size)
                self.goals_idxs_for_itr_dict[itr] = \
                    self.goals_idxs_for_itr_dict[itr][:self.meta_batch_size]

            # we build goals_to_use_dict regardless of how we obtained goals_pool, goals_idx_for_itr_dict
            self.goals_to_use_dict = {}
            for itr in range(self.start_itr, self.n_itr):
                if itr not in self.testing_itrs:
                    self.goals_to_use_dict[itr] = np.array([
                        self.goals_pool[idx]
                        for idx in self.goals_idxs_for_itr_dict[itr]
                    ])

        else:  # backwards compatibility code for old-format ETs
            self.goals_to_use_dict = joblib.load(self.expert_trajs_dir +
                                                 "goals.pkl")

            assert set(range(self.start_itr, self.n_itr)).issubset(
                set(self.goals_to_use_dict.keys())
            ), "Not all meta-iteration numbers have saved goals in %s" % expert_trajs_dir
            # chopping off unnecessary meta-iterations and goals
            self.goals_to_use_dict = {
                itr: self.goals_to_use_dict[itr][:self.meta_batch_size]
                for itr in range(self.start_itr, self.n_itr)
            }
        # saving goals pool
        if goals_pickle_to is not None:
            # logger.log("Saving goals to %s..." % goals_pickle_to)
            # joblib_dump_safe(self.goals_to_use_dict, goals_pickle_to)
            logger.log("Saving goals pool to %s..." % goals_pickle_to)
            joblib_dump_safe(
                dict(goals_pool=self.goals_pool,
                     idxs_dict=self.goals_idxs_for_itr_dict), goals_pickle_to)

        if sampler_cls is None:
            if singleton_pool.n_parallel > 1:
                sampler_cls = BatchSampler
                print("Using Batch Sampler")
            else:
                sampler_cls = VectorizedSampler
                print("Using Vectorized Sampler")
        if sampler_args is None:
            sampler_args = dict()
        if 'n_envs' not in sampler_args.keys():
            sampler_args['n_envs'] = self.meta_batch_size
        self.sampler = sampler_cls(self, **sampler_args)
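
# Hedged construction sketch (not from the original repo): the class that owns this
# __init__ is not named in this excerpt, so `MAMLBatchPolopt` below, the env/policy/
# baseline objects, and the expert-trajectory directory are hypothetical placeholders.
# algo = MAMLBatchPolopt(env=env,
#                        policy=policy,
#                        baseline=baseline,
#                        n_itr=800,
#                        batch_size=20,
#                        max_path_length=100,
#                        meta_batch_size=40,
#                        num_grad_updates=1,
#                        use_maml_il=True,
#                        expert_trajs_dir="saved_expert_traj/R7DOF/",
#                        goals_pool_to_load=None)
# algo.train()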