Example #1
def init_experiment(env_name,
                    model_name,
                    env_hyper_params,
                    random_seed,
                    files_to_save,
                    config_to_save,
                    experiment_log_dir,
                    redirect_output=False,
                    use_wandb=False,
                    virtual_display=False,
                    experiment_name=None,
                    wandb_project_name=None):
    experiment_name = model_name + '-' + env_name if experiment_name is None else experiment_name
    wandb_project_name = env_name if wandb_project_name is None else wandb_project_name

    expr_manager = ExperimentManager(experiment_log_dir, experiment_name)
    if redirect_output:
        expr_manager.redirect_output_to_logfile_as_well()
    # keep a copy of the given source files with the experiment logs
    for fname in files_to_save:
        expr_manager.make_copy_file(fname)
    # set up logging tool (wandb)
    expr_manager.use_wandb(use_wandb)
    if wandb_project_name:
        expr_manager.set_wandb_project_name(wandb_project_name)
    expr_manager.init_wandb()
    expr_manager.save_config_wandb(config=config_to_save)

    if virtual_display:
        expr_manager.open_virtual_display()

    # create environment
    env = make_env(env_name, env_hyper_params)
    set_seed_everywhere(random_seed, env)

    return env, expr_manager
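
A minimal, hypothetical call to `init_experiment` as defined above; the environment id, file list, and config values are illustrative assumptions rather than values from the source project:

# Hypothetical usage sketch; all argument values are illustrative assumptions.
env, expr_manager = init_experiment(
    env_name='CartPole-v1',          # assumed environment id
    model_name='dqn',                # assumed model name
    env_hyper_params={},             # forwarded to make_env
    random_seed=0,
    files_to_save=['train.py'],      # source files copied next to the logs
    config_to_save={'lr': 3e-4},     # assumed config dict to log
    experiment_log_dir='./logs',
    redirect_output=True,
    use_wandb=False,                 # skip wandb logging in this sketch
)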
Example #2
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        # initialise environment
        self.env = make_env(self.args.env_name,
                            self.args.max_rollouts_per_task,
                            seed=self.args.seed,
                            n_tasks=self.args.num_tasks)

        # unwrapped env to get some info about the environment
        unwrapped_env = self.env.unwrapped
        # split to train/eval tasks
        shuffled_tasks = np.random.permutation(
            unwrapped_env.get_all_task_idx())
        self.train_tasks = shuffled_tasks[:self.args.num_train_tasks]
        if self.args.num_eval_tasks > 0:
            self.eval_tasks = shuffled_tasks[-self.args.num_eval_tasks:]
        else:
            self.eval_tasks = []
        # maximum trajectory length: episode steps times rollouts per task
        self.args.max_trajectory_len = (unwrapped_env._max_episode_steps *
                                        self.args.max_rollouts_per_task)

        # get action / observation dimensions
        if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.env.action_space.shape[0]
        self.args.obs_dim = self.env.observation_space.shape[0]
        self.args.num_states = unwrapped_env.num_states if hasattr(
            unwrapped_env, 'num_states') else None
        self.args.act_space = self.env.action_space

        # initialize policy
        self.initialize_policy()
        # initialize buffer for RL updates
        self.policy_storage = MultiTaskPolicyStorage(
            max_replay_buffer_size=int(self.args.policy_buffer_size),
            obs_dim=self._get_augmented_obs_dim(),
            action_space=self.env.action_space,
            tasks=self.train_tasks,
            trajectory_len=args.max_trajectory_len,
        )
        self.current_experience_storage = None

        self.args.belief_reward = False  # initialize arg to not use belief rewards
Example #3
def run():
    trained_model = get_model(env_name,
                              config_class=Efficient_DQN_Config,
                              model_path=model_path,
                              model_class=Dueling_DQN_Agent)
    env_config = Efficient_DQN_Config()
    env_config.get_agent_config()
    env = make_env(env_name, env_config.agent_config)
    info_displayer = InfoDisplayer(screen_height=150 * 6, screen_width=250, frame_time=0.05)
    eval_agent(env, trained_model, 5, verbose=True, render=True,
               info_displayer=info_displayer)
    env.close()
Example #4
    def __init__(self, env_name, agent_config):
        self.agent_config = agent_config
        from environments.make_env import make_env
        self.env = make_env(env_name='MontezumaRevenge', agent_config=agent_config)
        self.spec = self.env.spec
        self.name = self.spec.id if self.spec is not None else 'MontezumaRevenge'
        self.ale = self.env.ale

        self.screen_width = agent_config.sys_args.frame_size
        self.screen_height = agent_config.sys_args.frame_size
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space

        self.mode = "train"
        self.life_lost = False
        self.init_screen = self.get_screen_gray()

        """
        Sub-goal definitions
        """
        self.goal_meaning = ['lower right ladder', 'jump to the left of devil', 'key', 'lower left ladder',
                             'lower right ladder', 'central high platform', 'right door']
        self.n_subgoal = len(self.goal_meaning)
        self.goalSet = []
        # goal 0
        self.goalSet.append([[69, 68], [73, 71]])  # Lower right ladder: bounding box used to detect the first subgoal
        # goal 2
        self.goalSet.append([[7, 41], [11, 45]])  # Key: the second subgoal
        # goal 3
        self.goalSet.append([[11, 68], [15, 71]])  # Lower left ladder (goal 3)
        # goal 4
        self.goalSet.append([[69, 68], [73, 71]])  # Lower right ladder again: the third subgoal
        # goal 6
        self.goalSet.append([[70, 20], [73, 35]])  # Right door: the fourth subgoal
        self.goalCenterLoc = []

        for goal in self.goalSet:
            goalCenter = [float(goal[0][0] + goal[1][0]) / 2, float(goal[0][1] + goal[1][1]) / 2]
            self.goalCenterLoc.append(goalCenter)

        self.agentOriginLoc = [42, 33]
        self.agentLastX = 42
        self.agentLastY = 33
        self.devilLastX = 0
        self.devilLastY = 0
        self.reached_goal = [0 for _ in range(self.n_subgoal)]

        self.stacked_state = self._get_init_stacked_state()
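
A hypothetical helper, not part of the original class, illustrating how the `goalSet` bounding boxes above could be checked against the tracked agent position; the [[x1, y1], [x2, y2]] box convention is an assumption:

    def _agent_in_goal_box(self, goal_idx):
        # Hypothetical sketch: assumes each goalSet entry is [[x1, y1], [x2, y2]]
        # with inclusive pixel bounds, and agentLastX/agentLastY in the same frame.
        (x1, y1), (x2, y2) = self.goalSet[goal_idx]
        return x1 <= self.agentLastX <= x2 and y1 <= self.agentLastY <= y2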
Example #5
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        self.args, env = off_utl.expand_args(self.args, include_act_space=True)
        if self.args.act_space.__class__.__name__ == "Discrete":
            self.args.policy = 'dqn'
        else:
            self.args.policy = 'sac'

        # load buffers with data
        if 'load_data' not in self.args or self.args.load_data:
            goals, augmented_obs_dim = self.load_buffer(
                env)  # env is input just for possible relabelling option
            self.args.augmented_obs_dim = augmented_obs_dim
            self.goals = goals

        # initialize policy
        self.initialize_policy()

        # load vae for inference in evaluation
        self.load_vae()

        # create environment for evaluation
        self.env = make_env(
            args.env_name,
            args.max_rollouts_per_task,
            presampled_tasks=args.presampled_tasks,
            seed=args.seed,
        )
        # n_tasks=self.args.num_eval_tasks)
        if self.args.env_name == 'GridNavi-v2':
            self.env.unwrapped.goals = [
                tuple(goal.astype(int)) for goal in self.goals
            ]
Example #6
def expand_args(args, include_act_space=False):
    # create env to get parameters
    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   seed=args.seed,
                   n_tasks=1)

    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        args.action_dim = 1
    else:
        args.action_dim = env.action_space.shape[0]
    args.obs_dim = env.observation_space.shape[0]

    args.trajectory_len = env.unwrapped._max_episode_steps * args.max_rollouts_per_task
    args.num_states = env.unwrapped.num_states if hasattr(
        env.unwrapped, 'num_states') else None
    if include_act_space:
        args.act_space = env.action_space
    return args, env
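
A minimal, hypothetical invocation of `expand_args`; the `Namespace` fields shown are assumptions based on the attributes the function reads:

# Hypothetical usage sketch; field values are illustrative assumptions.
from argparse import Namespace

args = Namespace(env_name='GridNavi-v2',   # env id also seen in Example #5
                 max_rollouts_per_task=2,
                 seed=0)
args, env = expand_args(args, include_act_space=True)
print(args.obs_dim, args.action_dim, args.trajectory_len)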
Example #7
def collect_rollout_per_policy(args, policy_dir, task):
    files_list = os.listdir(policy_dir)
    agent_path = os.path.join(policy_dir, sorted(files_list)[0])

    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   seed=args.seed,
                   n_tasks=1,
                   modify_init_state_dist=args.modify_init_state_dist
                   if 'modify_init_state_dist' in args else False,
                   on_circle_init_state=args.on_circle_init_state
                   if 'on_circle_init_state' in args else True)
    unwrapped_env = env.unwrapped
    unwrapped_env.goals = np.array([task])

    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        args.action_dim = 1
    else:
        args.action_dim = env.action_space.shape[0]
    args.obs_dim = env.observation_space.shape[0]
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= args.max_rollouts_per_task
    args.act_space = env.action_space

    agent = load_agent(args, agent_path)

    policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(args.policy_buffer_size),
        obs_dim=args.obs_dim,
        action_space=env.action_space,
        tasks=[0],
        trajectory_len=args.max_trajectory_len,
        num_reward_arrays=1,
        reward_types=[],
    )

    collect_rollouts_per_task(0, agent, policy_storage, env, args.num_rollouts)

    return policy_storage
Example #8
def load_dataset(data_dir,
                 args,
                 num_tasks=None,
                 allow_dense_data_loading=True,
                 arr_type='tensor'):
    dataset = []
    env_dir = args.env_name.replace('Sparse', '') \
        if 'dense_train_sparse_test' in args and \
           args.dense_train_sparse_test is True and \
           allow_dense_data_loading \
        else args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)
    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)
    for i, task in enumerate(tasks):
        exp_dir = os.path.join(exps_dir, all_dirs[task])
        goals.append(extract_goal_from_path(all_dirs[task]))
        if 'rewards.npy' not in os.listdir(exp_dir):
            print("rewards.npy file doesn't exist. Creating it...")
            env = make_env(args.env_name,
                           args.max_rollouts_per_task,
                           n_tasks=1)
            create_rewards_arr(env, path=exp_dir)
            print('Created rewards.npy file.')
        obs, actions, rewards, next_obs, terminals = load_transitions(exp_dir)

        if obs.dim() < 3:
            obs = obs.reshape(-1, args.trajectory_len,
                              obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len,
                                      actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len,
                                      rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len,
                                        next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len,
                                          terminals.shape[-1]).transpose(0, 1)
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
        else:
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
            obs = obs.transpose(0, 1).reshape(-1, obs.shape[-1])
            actions = actions.transpose(0, 1).reshape(-1, actions.shape[-1])
            rewards = rewards.transpose(0, 1).reshape(-1, rewards.shape[-1])
            next_obs = next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1])
            terminals = terminals.transpose(0,
                                            1).reshape(-1, terminals.shape[-1])

        if arr_type == 'numpy':
            obs = ptu.get_numpy(obs)
            actions = ptu.get_numpy(actions)
            rewards = ptu.get_numpy(rewards)
            next_obs = ptu.get_numpy(next_obs)
            terminals = ptu.get_numpy(terminals)

        dataset.append([obs, actions, rewards, next_obs, terminals])
        # print(exp_dir)
        # print('Obs shape: ' + str(np.shape(dataset[-1][0])) +
        #       '. Act shape: ' + str(np.shape(dataset[-1][1])) +
        #       '. Reward shape: ' + str(np.shape(dataset[-1][2])) +
        #       '. Next obs shape: ' + str(np.shape(dataset[-1][3])))
    print('{} experiments loaded.'.format(i + 1))
    goals = np.vstack(goals)

    return dataset, goals
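
A hypothetical call sketch for `load_dataset`; the directory name is illustrative, and `args` is assumed to already carry the fields read above (`env_name`, `main_data_dir`, `trajectory_len`, `num_trajs_per_task`):

# Hypothetical usage sketch; 'train_data' and num_tasks are illustrative assumptions.
# dataset holds one [obs, actions, rewards, next_obs, terminals] entry per sampled task,
# and goals stacks the per-task goals into a single array.
dataset, goals = load_dataset('train_data', args, num_tasks=10, arr_type='numpy')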
Example #9
def _borel(
        log_dir,
        pretrained_vae_dir,
        env_type,
        transform_data_bamdp,
        seed,
        path_length,
        meta_episode_len,
        relabelled_data_dir=None,
        offline_buffer_path_to_save_to=None,
        offline_buffer_path='',
        saved_tasks_path='',
        debug=False,
        vae_model_name=None,
        load_buffer_kwargs=None,
        gpu_id=0,
        **kwargs,
):
    if load_buffer_kwargs is None:
        load_buffer_kwargs = {}
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    parser = argparse.ArgumentParser()
    torch.autograd.set_detect_anomaly(True)

    if offline_buffer_path_to_save_to is None:
        offline_buffer_path_to_save_to = os.path.join(log_dir, 'transformed_data')

    # parser.add_argument('--env-type', default='gridworld')
    # parser.add_argument('--env-type', default='point_robot_sparse')
    # parser.add_argument('--env-type', default='cheetah_vel')
    parser.add_argument('--env-type', default=env_type)
    extra_args = []
    for k, v in kwargs.items():
        extra_args.append('--{}'.format(k))
        extra_args.append(str(v))
    args, rest_args = parser.parse_known_args(args=extra_args)
    args = env_name_to_args[env_type].get_args(rest_args)
    set_gpu_mode(torch.cuda.is_available() and args.use_gpu, gpu_id=gpu_id)

    if vae_model_name is None:
        vae_model_name = os.listdir(
            os.path.join(pretrained_vae_dir, args.env_name)
        )[0]

    vae_args = config_utl.load_config_file(os.path.join(pretrained_vae_dir, args.env_name,
                                                        vae_model_name, 'online_config.json'))
    args = config_utl.merge_configs(vae_args, args)     # order of input to this function is important
    # _, env = off_utl.expand_args(args)
    from environments.make_env import make_env
    task_data = joblib.load(saved_tasks_path)
    tasks = task_data['tasks']
    args.presampled_tasks = tasks
    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   presampled_tasks=tasks,
               seed=args.seed)
               # n_tasks=1)

    args.vae_dir = pretrained_vae_dir
    args.data_dir = None
    args.vae_model_name = vae_model_name
    if transform_data_bamdp:
        # Transform data BAMDP (state relabelling)
        # load VAE for state relabelling
        print("performing state-relabeling")
        vae_models_path = os.path.join(pretrained_vae_dir, args.env_name,
                                       vae_model_name, 'models')
        vae = VAE(args)
        off_utl.load_trained_vae(vae, vae_models_path)
        # load data and relabel
        os.makedirs(offline_buffer_path_to_save_to, exist_ok=True)
        dataset, goals = off_utl.load_pearl_buffer(
            offline_buffer_path,
            tasks,
            add_done_info=env.add_done_info,
            path_length=path_length,
            meta_episode_len=meta_episode_len,
            **load_buffer_kwargs
        )
        dataset = [[x.astype(np.float32) for x in d] for d in dataset]
        bamdp_dataset = off_utl.transform_mdps_ds_to_bamdp_ds(dataset, vae, args)
        # save relabelled data
        print("saving state-relabeled data to ", offline_buffer_path_to_save_to)
        off_utl.save_dataset(offline_buffer_path_to_save_to, bamdp_dataset, goals)
        relabelled_data_dir = offline_buffer_path_to_save_to
    args.relabelled_data_dir = relabelled_data_dir
    args.max_rollouts_per_task = 3
    args.results_log_dir = log_dir

    if debug:
        print("DEBUG MODE ON")
        args.rl_updates_per_iter = 1
        args.log_interval = 1
    learner = OfflineMetaLearner(args)

    learner.train()
Example #10
File: learner.py  Project: matants/OMRL_MER
    def __init__(self, args):
        """
        Seeds everything.
        Initialises: logger, environments, policy (+storage +optimiser).
        """

        self.args = args

        # make sure everything has the same seed
        utl.seed(self.args.seed)

        # initialise environment
        self.env = make_env(
            self.args.env_name,
            self.args.max_rollouts_per_task,
            seed=self.args.seed,
            n_tasks=1,
            modify_init_state_dist=self.args.modify_init_state_dist
            if 'modify_init_state_dist' in self.args else False,
            on_circle_init_state=self.args.on_circle_init_state
            if 'on_circle_init_state' in self.args else True)

        # saving buffer with task in name folder
        if hasattr(self.args, 'save_buffer') and self.args.save_buffer:
            env_dir = os.path.join(self.args.main_save_dir,
                                   '{}'.format(self.args.env_name))
            goal = self.env.unwrapped._goal
            self.output_dir = os.path.join(
                env_dir, self.args.save_dir,
                'seed_{}_'.format(self.args.seed) +
                off_utl.create_goal_path_ext_from_goal(goal))

        if self.args.save_models or self.args.save_buffer:
            os.makedirs(self.output_dir, exist_ok=True)
            config_utl.save_config_file(args, self.output_dir)

        # initialize tensorboard logger
        if self.args.log_tensorboard:
            self.tb_logger = TBLogger(self.args)

        # if not self.args.log_tensorboard:
        #     self.save_config_json_file()
        # unwrapped env to get some info about the environment
        unwrapped_env = self.env.unwrapped

        # maximum trajectory length: episode steps times rollouts per task
        self.args.max_trajectory_len = (unwrapped_env._max_episode_steps *
                                        self.args.max_rollouts_per_task)

        # get action / observation dimensions
        if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
            self.args.action_dim = 1
        else:
            self.args.action_dim = self.env.action_space.shape[0]
        self.args.obs_dim = self.env.observation_space.shape[0]
        self.args.num_states = unwrapped_env.num_states if hasattr(
            unwrapped_env, 'num_states') else None
        self.args.act_space = self.env.action_space

        # simulate env step to get reward types
        _, _, _, info = unwrapped_env.step(unwrapped_env.action_space.sample())
        reward_types = [
            reward_type for reward_type in list(info.keys())
            if reward_type.startswith('reward')
        ]

        # support dense rewards training (if exists)
        self.args.dense_train_sparse_test = self.args.dense_train_sparse_test \
            if 'dense_train_sparse_test' in self.args else False

        # initialize policy
        self.initialize_policy()
        # initialize buffer for RL updates
        self.policy_storage = MultiTaskPolicyStorage(
            max_replay_buffer_size=int(self.args.policy_buffer_size),
            obs_dim=self.args.obs_dim,
            action_space=self.env.action_space,
            tasks=[0],
            trajectory_len=args.max_trajectory_len,
            num_reward_arrays=len(reward_types)
            if reward_types and self.args.dense_train_sparse_test else 1,
            reward_types=reward_types,
        )

        self.args.belief_reward = False  # initialize arg to not use belief rewards