def init_experiment(env_name, model_name, env_hyper_params, random_seed,
                    files_to_save, config_to_save, experiment_log_dir,
                    redirect_output=False, use_wandb=False, virtual_display=False,
                    experiment_name=None, wandb_project_name=None):
    experiment_name = model_name + '-' + env_name if experiment_name is None else experiment_name
    wandb_project_name = env_name if wandb_project_name is None else wandb_project_name

    expr_manager = ExperimentManager(experiment_log_dir, experiment_name)
    if redirect_output:
        expr_manager.redirect_output_to_logfile_as_well()
    for fname in files_to_save:
        expr_manager.make_copy_file(fname)

    # set up logging tool (wandb)
    expr_manager.use_wandb(use_wandb)
    if wandb_project_name:
        expr_manager.set_wandb_project_name(wandb_project_name)
    expr_manager.init_wandb()
    expr_manager.save_config_wandb(config=config_to_save)

    if virtual_display:
        expr_manager.open_virtual_display()

    # create environment
    env = make_env(env_name, env_hyper_params)
    set_seed_everywhere(random_seed, env)

    return env, expr_manager

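# A minimal usage sketch of init_experiment. It only mirrors the signature above;
# the environment name, file list, and log directory are illustrative placeholders,
# and `example_init_experiment` is not a function from this repo.
def example_init_experiment():
    config = Efficient_DQN_Config()        # config class also used in run() below
    config.get_agent_config()
    env, expr_manager = init_experiment(
        env_name='MontezumaRevenge',
        model_name='dueling-dqn',
        env_hyper_params=config.agent_config,
        random_seed=0,
        files_to_save=['train.py'],        # placeholder file list
        config_to_save=config,
        experiment_log_dir='./logs',       # placeholder log directory
        redirect_output=True,
        use_wandb=False,
    )
    return env, expr_manager
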
def __init__(self, args):
    """
    Seeds everything.
    Initialises: logger, environments, policy (+storage +optimiser).
    """

    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)

    # initialise environment
    self.env = make_env(self.args.env_name,
                        self.args.max_rollouts_per_task,
                        seed=self.args.seed,
                        n_tasks=self.args.num_tasks)

    # unwrapped env to get some info about the environment
    unwrapped_env = self.env.unwrapped

    # split to train/eval tasks
    shuffled_tasks = np.random.permutation(unwrapped_env.get_all_task_idx())
    self.train_tasks = shuffled_tasks[:self.args.num_train_tasks]
    if self.args.num_eval_tasks > 0:
        self.eval_tasks = shuffled_tasks[-self.args.num_eval_tasks:]
    else:
        self.eval_tasks = []

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task
    self.args.max_trajectory_len = args.max_trajectory_len

    # get action / observation dimensions
    if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.env.action_space.shape[0]
    self.args.obs_dim = self.env.observation_space.shape[0]
    self.args.num_states = unwrapped_env.num_states if hasattr(unwrapped_env, 'num_states') else None
    self.args.act_space = self.env.action_space

    # initialize policy
    self.initialize_policy()

    # initialize buffer for RL updates
    self.policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(self.args.policy_buffer_size),
        obs_dim=self._get_augmented_obs_dim(),
        action_space=self.env.action_space,
        tasks=self.train_tasks,
        trajectory_len=args.max_trajectory_len,
    )
    self.current_experience_storage = None

    self.args.belief_reward = False  # initialize arg to not use belief rewards

def run():
    trained_model = get_model(env_name,
                              config_class=Efficient_DQN_Config,
                              model_path=model_path,
                              model_class=Dueling_DQN_Agent)
    env_config = Efficient_DQN_Config()
    env_config.get_agent_config()
    env = make_env(env_name, env_config.agent_config)
    info_displayer = InfoDisplayer(screen_height=150 * 6,
                                   screen_width=250,
                                   frame_time=0.05)
    eval_agent(env, trained_model, 5,
               verbose=True, render=True, info_displayer=info_displayer)
    env.close()

def __init__(self, env_name, agent_config):
    self.agent_config = agent_config
    from environments.make_env import make_env
    self.env = make_env(env_name='MontezumaRevenge', agent_config=agent_config)
    self.spec = self.env.spec
    self.name = self.spec.id if self.spec is not None else 'MontezumaRevenge'
    self.ale = self.env.ale
    self.screen_width = agent_config.sys_args.frame_size
    self.screen_height = agent_config.sys_args.frame_size
    self.observation_space = self.env.observation_space
    self.action_space = self.env.action_space
    self.mode = "train"
    self.life_lost = False
    self.init_screen = self.get_screen_gray()

    """ Sub-goal definitions """
    self.goal_meaning = ['lower right ladder', 'jump to the left of devil', 'key',
                         'lower left ladder', 'lower right ladder',
                         'central high platform', 'right door']
    self.n_subgoal = len(self.goal_meaning)
    self.goalSet = []
    # goal 0
    self.goalSet.append([[69, 68], [73, 71]])  # Lower Right Ladder. This is the box for detecting first subgoal
    # goal 2
    self.goalSet.append([[7, 41], [11, 45]])   # Key. This will be second sub goal
    # goal 3
    self.goalSet.append([[11, 68], [15, 71]])  # lower left ladder 3
    # goal 4
    self.goalSet.append([[69, 68], [73, 71]])  # Lower Right Ladder again, this will be the third subgoal
    # goal 6
    self.goalSet.append([[70, 20], [73, 35]])  # Right Door. This will be the 4th subgoal
    self.goalCenterLoc = []
    for goal in self.goalSet:
        goalCenter = [float(goal[0][0] + goal[1][0]) / 2,
                      float(goal[0][1] + goal[1][1]) / 2]
        self.goalCenterLoc.append(goalCenter)
    self.agentOriginLoc = [42, 33]
    self.agentLastX = 42
    self.agentLastY = 33
    self.devilLastX = 0
    self.devilLastY = 0
    self.reached_goal = [0 for _ in range(self.n_subgoal)]
    self.stacked_state = self._get_init_stacked_state()

def __init__(self, args):
    """
    Seeds everything.
    Initialises: logger, environments, policy (+storage +optimiser).
    """

    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)

    self.args, env = off_utl.expand_args(self.args, include_act_space=True)
    if self.args.act_space.__class__.__name__ == "Discrete":
        self.args.policy = 'dqn'
    else:
        self.args.policy = 'sac'

    # load buffers with data
    if 'load_data' not in self.args or self.args.load_data:
        # env is input just for possible relabelling option
        goals, augmented_obs_dim = self.load_buffer(env)
        self.args.augmented_obs_dim = augmented_obs_dim
        self.goals = goals

    # initialize policy
    self.initialize_policy()

    # load vae for inference in evaluation
    self.load_vae()

    # create environment for evaluation
    self.env = make_env(
        args.env_name,
        args.max_rollouts_per_task,
        presampled_tasks=args.presampled_tasks,
        seed=args.seed,
    )  # n_tasks=self.args.num_eval_tasks)

    if self.args.env_name == 'GridNavi-v2':
        self.env.unwrapped.goals = [
            tuple(goal.astype(int)) for goal in self.goals
        ]

def expand_args(args, include_act_space=False):
    # create env to get parameters
    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   seed=args.seed,
                   n_tasks=1)
    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        args.action_dim = 1
    else:
        args.action_dim = env.action_space.shape[0]
    args.obs_dim = env.observation_space.shape[0]
    args.trajectory_len = env.unwrapped._max_episode_steps * args.max_rollouts_per_task
    args.num_states = env.unwrapped.num_states if hasattr(env.unwrapped, 'num_states') else None
    if include_act_space:
        args.act_space = env.action_space
    return args, env

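# Usage sketch for expand_args, assuming `args` already carries env_name,
# max_rollouts_per_task and seed (as in the callers above):
#
#     args, env = expand_args(args, include_act_space=True)
#
# Afterwards args.obs_dim, args.action_dim and args.trajectory_len (and args.act_space,
# when requested) are populated, and `env` can be reused, e.g. for the relabelling
# option in load_buffer above.
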
def collect_rollout_per_policy(args, policy_dir, task):
    files_list = os.listdir(policy_dir)
    agent_path = os.path.join(policy_dir, sorted(files_list)[0])

    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   seed=args.seed,
                   n_tasks=1,
                   modify_init_state_dist=args.modify_init_state_dist
                   if 'modify_init_state_dist' in args else False,
                   on_circle_init_state=args.on_circle_init_state
                   if 'on_circle_init_state' in args else True)
    unwrapped_env = env.unwrapped
    unwrapped_env.goals = np.array([task])

    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        args.action_dim = 1
    else:
        args.action_dim = env.action_space.shape[0]
    args.obs_dim = env.observation_space.shape[0]
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= args.max_rollouts_per_task
    args.act_space = env.action_space

    agent = load_agent(args, agent_path)

    policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(args.policy_buffer_size),
        obs_dim=args.obs_dim,
        action_space=env.action_space,
        tasks=[0],
        trajectory_len=args.max_trajectory_len,
        num_reward_arrays=1,
        reward_types=[],
    )

    collect_rollouts_per_task(0, agent, policy_storage, env, args.num_rollouts)

    return policy_storage

def load_dataset(data_dir, args, num_tasks=None, allow_dense_data_loading=True, arr_type='tensor'):
    dataset = []
    env_dir = args.env_name.replace('Sparse', '') \
        if 'dense_train_sparse_test' in args and \
           args.dense_train_sparse_test is True and \
           allow_dense_data_loading \
        else args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)
    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)
    for i, task in enumerate(tasks):
        exp_dir = os.path.join(exps_dir, all_dirs[task])
        goals.append(extract_goal_from_path(all_dirs[task]))
        if 'rewards.npy' not in os.listdir(exp_dir):
            print('rewards.npy file doesn\'t exist. Creating it..')
            env = make_env(args.env_name, args.max_rollouts_per_task, n_tasks=1)
            create_rewards_arr(env, path=exp_dir)
            print('Created rewards.npy file.')
        obs, actions, rewards, next_obs, terminals = load_transitions(exp_dir)
        if obs.dim() < 3:
            obs = obs.reshape(-1, args.trajectory_len, obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len, actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len, rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len, next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len, terminals.shape[-1]).transpose(0, 1)
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
        else:
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
            obs = obs.transpose(0, 1).reshape(-1, obs.shape[-1])
            actions = actions.transpose(0, 1).reshape(-1, actions.shape[-1])
            rewards = rewards.transpose(0, 1).reshape(-1, rewards.shape[-1])
            next_obs = next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1])
            terminals = terminals.transpose(0, 1).reshape(-1, terminals.shape[-1])
        if arr_type == 'numpy':
            obs = ptu.get_numpy(obs)
            actions = ptu.get_numpy(actions)
            rewards = ptu.get_numpy(rewards)
            next_obs = ptu.get_numpy(next_obs)
            terminals = ptu.get_numpy(terminals)
        dataset.append([obs, actions, rewards, next_obs, terminals])
        # print(exp_dir)
        # print('Obs shape: ' + str(np.shape(dataset[-1][0])) +
        #       '. Act shape: ' + str(np.shape(dataset[-1][1])) +
        #       '. Reward shape: ' + str(np.shape(dataset[-1][2])) +
        #       '. Next obs shape: ' + str(np.shape(dataset[-1][3])))
        print('{} experiments loaded.'.format(i + 1))
    goals = np.vstack(goals)
    return dataset, goals

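# Usage sketch for load_dataset. The directory name 'train_data' and the task count
# are illustrative placeholders; `args` must already hold main_data_dir, env_name,
# trajectory_len, num_trajs_per_task and max_rollouts_per_task (e.g. via expand_args above).
#
#     dataset, goals = load_dataset('train_data', args, num_tasks=20, arr_type='numpy')
#
# dataset[i] is a [obs, actions, rewards, next_obs, terminals] list for the i-th
# sampled task, and goals stacks one goal row per task.
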
def _borel(
        log_dir,
        pretrained_vae_dir,
        env_type,
        transform_data_bamdp,
        seed,
        path_length,
        meta_episode_len,
        relabelled_data_dir=None,
        offline_buffer_path_to_save_to=None,
        offline_buffer_path='',
        saved_tasks_path='',
        debug=False,
        vae_model_name=None,
        load_buffer_kwargs=None,
        gpu_id=0,
        **kwargs,
):
    if load_buffer_kwargs is None:
        load_buffer_kwargs = {}
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    parser = argparse.ArgumentParser()
    torch.autograd.set_detect_anomaly(True)

    if offline_buffer_path_to_save_to is None:
        offline_buffer_path_to_save_to = os.path.join(log_dir, 'transformed_data')

    # parser.add_argument('--env-type', default='gridworld')
    # parser.add_argument('--env-type', default='point_robot_sparse')
    # parser.add_argument('--env-type', default='cheetah_vel')
    parser.add_argument('--env-type', default=env_type)
    extra_args = []
    for k, v in kwargs.items():
        extra_args.append('--{}'.format(k))
        extra_args.append(str(v))
    args, rest_args = parser.parse_known_args(args=extra_args)
    args = env_name_to_args[env_type].get_args(rest_args)

    set_gpu_mode(torch.cuda.is_available() and args.use_gpu, gpu_id=gpu_id)

    if vae_model_name is None:
        vae_model_name = os.listdir(
            os.path.join(pretrained_vae_dir, args.env_name)
        )[0]
    vae_args = config_utl.load_config_file(
        os.path.join(pretrained_vae_dir, args.env_name, vae_model_name, 'online_config.json'))
    args = config_utl.merge_configs(vae_args, args)  # order of input to this function is important

    # _, env = off_utl.expand_args(args)
    from environments.make_env import make_env
    task_data = joblib.load(saved_tasks_path)
    tasks = task_data['tasks']
    args.presampled_tasks = tasks
    env = make_env(args.env_name,
                   args.max_rollouts_per_task,
                   presampled_tasks=tasks,
                   seed=args.seed)  # , n_tasks=1)
    args.vae_dir = pretrained_vae_dir
    args.data_dir = None
    args.vae_model_name = vae_model_name

    if transform_data_bamdp:
        # Transform data to BAMDP (state relabelling):
        # load VAE for state relabelling
        print("performing state-relabeling")
        vae_models_path = os.path.join(pretrained_vae_dir, args.env_name, vae_model_name, 'models')
        vae = VAE(args)
        off_utl.load_trained_vae(vae, vae_models_path)
        # load data and relabel
        os.makedirs(offline_buffer_path_to_save_to, exist_ok=True)
        dataset, goals = off_utl.load_pearl_buffer(
            offline_buffer_path,
            tasks,
            add_done_info=env.add_done_info,
            path_length=path_length,
            meta_episode_len=meta_episode_len,
            **load_buffer_kwargs,
        )
        dataset = [[x.astype(np.float32) for x in d] for d in dataset]
        bamdp_dataset = off_utl.transform_mdps_ds_to_bamdp_ds(dataset, vae, args)
        # save relabelled data
        print("saving state-relabeled data to ", offline_buffer_path_to_save_to)
        off_utl.save_dataset(offline_buffer_path_to_save_to, bamdp_dataset, goals)
        relabelled_data_dir = offline_buffer_path_to_save_to

    args.relabelled_data_dir = relabelled_data_dir
    args.max_rollouts_per_task = 3
    args.results_log_dir = log_dir
    if debug:
        print("DEBUG MODE ON")
        args.rl_updates_per_iter = 1
        args.log_interval = 1
    learner = OfflineMetaLearner(args)
    learner.train()

def __init__(self, args):
    """
    Seeds everything.
    Initialises: logger, environments, policy (+storage +optimiser).
    """

    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialise environment
    self.env = make_env(
        self.args.env_name,
        self.args.max_rollouts_per_task,
        seed=self.args.seed,
        n_tasks=1,
        modify_init_state_dist=self.args.modify_init_state_dist
        if 'modify_init_state_dist' in self.args else False,
        on_circle_init_state=self.args.on_circle_init_state
        if 'on_circle_init_state' in self.args else True)

    # saving buffer with task in name folder
    if hasattr(self.args, 'save_buffer') and self.args.save_buffer:
        env_dir = os.path.join(self.args.main_save_dir, '{}'.format(self.args.env_name))
        goal = self.env.unwrapped._goal
        self.output_dir = os.path.join(
            env_dir, self.args.save_dir,
            'seed_{}_'.format(self.args.seed) + off_utl.create_goal_path_ext_from_goal(goal))
    if self.args.save_models or self.args.save_buffer:
        os.makedirs(self.output_dir, exist_ok=True)
        config_utl.save_config_file(args, self.output_dir)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)
    # if not self.args.log_tensorboard:
    #     self.save_config_json_file()

    # unwrapped env to get some info about the environment
    unwrapped_env = self.env.unwrapped

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task
    self.args.max_trajectory_len = args.max_trajectory_len

    # get action / observation dimensions
    if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.env.action_space.shape[0]
    self.args.obs_dim = self.env.observation_space.shape[0]
    self.args.num_states = unwrapped_env.num_states if hasattr(unwrapped_env, 'num_states') else None
    self.args.act_space = self.env.action_space

    # simulate env step to get reward types
    _, _, _, info = unwrapped_env.step(unwrapped_env.action_space.sample())
    reward_types = [
        reward_type for reward_type in list(info.keys())
        if reward_type.startswith('reward')
    ]

    # support dense rewards training (if exists)
    self.args.dense_train_sparse_test = self.args.dense_train_sparse_test \
        if 'dense_train_sparse_test' in self.args else False

    # initialize policy
    self.initialize_policy()

    # initialize buffer for RL updates
    self.policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(self.args.policy_buffer_size),
        obs_dim=self.args.obs_dim,
        action_space=self.env.action_space,
        tasks=[0],
        trajectory_len=args.max_trajectory_len,
        num_reward_arrays=len(reward_types)
        if reward_types and self.args.dense_train_sparse_test else 1,
        reward_types=reward_types,
    )

    self.args.belief_reward = False  # initialize arg to not use belief rewards