Example #1
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        # initialise environment
        self.env = make_env(self.args.env_name,
                            self.args.max_rollouts_per_task,
                            seed=self.args.seed,
                            n_tasks=self.args.num_tasks)

        # unwrapped env to get some info about the environment
        unwrapped_env = self.env.unwrapped
        # split to train/eval tasks
        shuffled_tasks = np.random.permutation(
            unwrapped_env.get_all_task_idx())
        self.train_tasks = shuffled_tasks[:self.args.num_train_tasks]
        if self.args.num_eval_tasks > 0:
            self.eval_tasks = shuffled_tasks[-self.args.num_eval_tasks:]
        else:
            self.eval_tasks = []
        # calculate the maximum trajectory length
        self.args.max_trajectory_len = (unwrapped_env._max_episode_steps *
                                        self.args.max_rollouts_per_task)

        # get action / observation dimensions
        if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
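            # a discrete action is represented by a single integer index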
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.env.action_space.shape[0]
        self.args.obs_dim = self.env.observation_space.shape[0]
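        # number of discrete states, if the environment exposes one (e.g. gridworlds)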
        self.args.num_states = unwrapped_env.num_states if hasattr(
            unwrapped_env, 'num_states') else None
        self.args.act_space = self.env.action_space

        # initialize policy
        self.initialize_policy()
        # initialize buffer for RL updates
        self.policy_storage = MultiTaskPolicyStorage(
            max_replay_buffer_size=int(self.args.policy_buffer_size),
            obs_dim=self._get_augmented_obs_dim(),
            action_space=self.env.action_space,
            tasks=self.train_tasks,
            trajectory_len=args.max_trajectory_len,
        )
        self.current_experience_storage = None

        self.args.belief_reward = False  # initialize arg to not use belief rewards
Example #2
    def __init__(self, args):

        self.args = args
        utl.seed(self.args.seed, self.args.deterministic_execution)

        # calculate number of updates and keep count of frames/iterations
        self.num_updates = int(
            args.num_frames) // args.policy_num_steps // args.num_processes
        self.frames = 0
        self.iter_idx = 0

        # initialise tensorboard logger
        self.logger = TBLogger(self.args, self.args.exp_label)

        # initialise environments
        self.envs = make_vec_envs(
            env_name=args.env_name,
            seed=args.seed,
            num_processes=args.num_processes,
            gamma=args.policy_gamma,
            device=device,
            episodes_per_task=self.args.max_rollouts_per_task,
            normalise_rew=args.norm_rew_for_policy,
            ret_rms=None,
        )

        # calculate the maximum trajectory length
        self.args.max_trajectory_len = self.envs._max_episode_steps
        self.args.max_trajectory_len *= self.args.max_rollouts_per_task

        # get policy input dimensions
        self.args.state_dim = self.envs.observation_space.shape[0]
        self.args.task_dim = self.envs.task_dim
        self.args.belief_dim = self.envs.belief_dim
        self.args.num_states = self.envs.num_states
        # get policy output (action) dimensions
        self.args.action_space = self.envs.action_space
        if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        elif isinstance(self.envs.action_space,
                        gym.spaces.multi_discrete.MultiDiscrete):
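            # for MultiDiscrete spaces, use the number of choices of the first sub-action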
            self.args.action_dim = self.envs.action_space.nvec[0]
        else:
            self.args.action_dim = self.envs.action_space.shape[0]

        # initialise VAE and policy
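        # the lambda lets the VAE always read the current iteration index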
        self.vae = VaribadVAE(self.args, self.logger, lambda: self.iter_idx)
        self.policy_storage = self.initialise_policy_storage()
        self.policy = self.initialise_policy()
Example #3
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        self.args, env = off_utl.expand_args(self.args, include_act_space=True)
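        # pick the RL algorithm from the action space: DQN for discrete, SAC for continuous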
        if self.args.act_space.__class__.__name__ == "Discrete":
            self.args.policy = 'dqn'
        else:
            self.args.policy = 'sac'

        # load buffers with data (unless loading is explicitly disabled via args.load_data)
        if 'load_data' not in self.args or self.args.load_data:
            goals, augmented_obs_dim = self.load_buffer(
                env)  # env is input just for possible relabelling option
            self.args.augmented_obs_dim = augmented_obs_dim
            self.goals = goals

        # initialize policy
        self.initialize_policy()

        # load vae for inference in evaluation
        self.load_vae()

        # create environment for evaluation
        self.env = make_env(
            args.env_name,
            args.max_rollouts_per_task,
            presampled_tasks=args.presampled_tasks,
            seed=args.seed,
        )
        # n_tasks=self.args.num_eval_tasks)
        if self.args.env_name == 'GridNavi-v2':
            self.env.unwrapped.goals = [
                tuple(goal.astype(int)) for goal in self.goals
            ]
Example #4
    def __init__(self, args):
        self.args = args
        utl.seed(self.args.seed, self.args.deterministic_execution)

        # count number of frames and number of meta-iterations
        self.frames = 0
        self.iter_idx = 0

        # initialise tensorboard logger
        self.logger = TBLogger(self.args, self.args.exp_label)

        # initialise environments
        self.envs = make_vec_envs(
            env_name=args.env_name,
            seed=args.seed,
            num_processes=args.num_processes,
            gamma=args.policy_gamma,
            log_dir=args.agent_log_dir,
            device=device,
            allow_early_resets=False,
            episodes_per_task=self.args.max_rollouts_per_task,
            obs_rms=None,
            ret_rms=None,
        )

        # calculate the maximum trajectory length
        args.max_trajectory_len = self.envs._max_episode_steps
        args.max_trajectory_len *= self.args.max_rollouts_per_task

        # calculate number of meta updates
        self.args.num_updates = int(
            args.num_frames) // args.policy_num_steps // args.num_processes

        # get action / observation dimensions
        if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.envs.action_space.shape[0]
        self.args.obs_dim = self.envs.observation_space.shape[0]
        self.args.num_states = self.envs.num_states if \
            self.args.env_name.startswith('Grid') else None
        self.args.act_space = self.envs.action_space

        self.vae = VaribadVAE(self.args, self.logger, lambda: self.iter_idx)

        self.initialise_policy()
Example #5
    def __init__(self, args):

        self.args = args
        utl.seed(self.args.seed, self.args.deterministic_execution)

        # calculate number of updates and keep count of frames/iterations
        self.num_updates = int(
            args.num_frames) // args.policy_num_steps // args.num_processes
        self.frames = 0
        self.iter_idx = -1

        # initialise tensorboard logger
        self.logger = TBLogger(self.args, self.args.exp_label)

        # initialise environments
        self.envs = make_vec_envs(
            env_name=args.env_name,
            seed=args.seed,
            num_processes=args.num_processes,
            gamma=args.policy_gamma,
            device=device,
            episodes_per_task=self.args.max_rollouts_per_task,
            normalise_rew=args.norm_rew_for_policy,
            ret_rms=None,
            tasks=None)

        if self.args.single_task_mode:
            # get the current tasks (which will be num_process many different tasks)
            self.train_tasks = self.envs.get_task()
            # set the tasks to the first task (i.e. just a random task)
            self.train_tasks[1:] = self.train_tasks[0]
            # make it a list
            self.train_tasks = [t for t in self.train_tasks]
            # re-initialise environments with those tasks
            self.envs = make_vec_envs(
                env_name=args.env_name,
                seed=args.seed,
                num_processes=args.num_processes,
                gamma=args.policy_gamma,
                device=device,
                episodes_per_task=self.args.max_rollouts_per_task,
                normalise_rew=args.norm_rew_for_policy,
                ret_rms=None,
                tasks=self.train_tasks,
            )
            # save the training tasks so we can evaluate on the same envs later
            utl.save_obj(self.train_tasks, self.logger.full_output_folder,
                         "train_tasks")
        else:
            self.train_tasks = None

        # calculate the maximum trajectory length
        args.max_trajectory_len = self.envs._max_episode_steps
        args.max_trajectory_len *= self.args.max_rollouts_per_task

        # get policy input dimensions
        self.args.state_dim = self.envs.observation_space.shape[0]
        self.args.task_dim = self.envs.task_dim
        self.args.belief_dim = self.envs.belief_dim
        self.args.num_states = self.envs.num_states
        # get policy output (action) dimensions
        self.args.action_space = self.envs.action_space
        if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.envs.action_space.shape[0]

        # initialise policy
        self.policy_storage = self.initialise_policy_storage()
        self.policy = self.initialise_policy()
Example #6
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialise environment
        self.env = make_env(
            self.args.env_name,
            self.args.max_rollouts_per_task,
            seed=self.args.seed,
            n_tasks=1,
            modify_init_state_dist=self.args.modify_init_state_dist
            if 'modify_init_state_dist' in self.args else False,
            on_circle_init_state=self.args.on_circle_init_state
            if 'on_circle_init_state' in self.args else True)

        # saving buffer with task in name folder
        if hasattr(self.args, 'save_buffer') and self.args.save_buffer:
            env_dir = os.path.join(self.args.main_save_dir, self.args.env_name)
            goal = self.env.unwrapped._goal
            self.output_dir = os.path.join(
                env_dir, self.args.save_dir,
                'seed_{}_'.format(self.args.seed) +
                off_utl.create_goal_path_ext_from_goal(goal))

        if self.args.save_models or self.args.save_buffer:
            os.makedirs(self.output_dir, exist_ok=True)
            config_utl.save_config_file(args, self.output_dir)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        # if not self.args.log_tensorboard:
        #     self.save_config_json_file()
        # unwrapped env to get some info about the environment
        unwrapped_env = self.env.unwrapped

        # calculate the maximum trajectory length
        self.args.max_trajectory_len = (unwrapped_env._max_episode_steps *
                                        self.args.max_rollouts_per_task)

        # get action / observation dimensions
        if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.env.action_space.shape[0]
        self.args.obs_dim = self.env.observation_space.shape[0]
        self.args.num_states = unwrapped_env.num_states if hasattr(
            unwrapped_env, 'num_states') else None
        self.args.act_space = self.env.action_space

        # simulate env step to get reward types
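        # note: relies on the 4-tuple (obs, reward, done, info) step API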
        _, _, _, info = unwrapped_env.step(unwrapped_env.action_space.sample())
        reward_types = [
            reward_type for reward_type in list(info.keys())
            if reward_type.startswith('reward')
        ]

        # support training with dense rewards (if the option exists in args)
        self.args.dense_train_sparse_test = self.args.dense_train_sparse_test \
            if 'dense_train_sparse_test' in self.args else False

        # initialize policy
        self.initialize_policy()
        # initialize buffer for RL updates
        self.policy_storage = MultiTaskPolicyStorage(
            max_replay_buffer_size=int(self.args.policy_buffer_size),
            obs_dim=self.args.obs_dim,
            action_space=self.env.action_space,
            tasks=[0],
            trajectory_len=args.max_trajectory_len,
            num_reward_arrays=len(reward_types)
            if reward_types and self.args.dense_train_sparse_test else 1,
            reward_types=reward_types,
        )

        self.args.belief_reward = False  # initialize arg to not use belief rewards