def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy and environment loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
        print('Using GPU')
    if isinstance(env, VAEWrappedEnv) and hasattr(env, 'mode'):
        env.mode(args.mode)
        print('Set environment mode {}'.format(args.mode))
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    paths = []
    while True:
        paths.append(multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=not args.hide,
            observation_key='observation',
            desired_goal_key='desired_goal',
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = multitask_rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
            render=self._render,
            render_kwargs=self._render_kwargs,
            observation_key=self._observation_key,
            desired_goal_key=self._desired_goal_key,
            return_dict_obs=True,
        )
        path_len = len(path['actions'])
        if (
                path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths
        ):
            # The final rollout was cut short by the step budget and did not
            # terminate on its own; drop it when incomplete paths are discarded.
            break
        num_steps_collected += path_len
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
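# Hedged usage sketch (not part of the original code): shows how a collector
# exposing collect_new_paths() is typically driven once per epoch. The
# GoalConditionedPathCollector name, its import path, and the constructor
# arguments are assumptions inferred from the attributes the method reads
# (self._env, self._policy, self._observation_key, ...); adjust to the actual
# class in your codebase.
def example_collect_epoch_paths(env, policy, replay_buffer):
    from rlkit.samplers.data_collector import GoalConditionedPathCollector  # import path assumed
    path_collector = GoalConditionedPathCollector(
        env,
        policy,
        observation_key='observation',
        desired_goal_key='desired_goal',
    )
    new_paths = path_collector.collect_new_paths(
        max_path_length=50,                 # cap on each rollout (illustrative)
        num_steps=1000,                     # per-epoch sample budget (illustrative)
        discard_incomplete_paths=True,      # drop a trailing partial rollout
    )
    replay_buffer.add_paths(new_paths)
    return new_paths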
def obtain_samples(self, rollout_type="multitask"):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        if self.randomize_env:
            self.env, env_name = self.alg.get_new_env()
            print(f"Evaluating {env_name}")
        if rollout_type == "multitask":
            path = multitask_rollout(
                self.env,
                self.policy,
                max_path_length=self.max_path_length,
                animated=False,
                observation_key='observation',
                desired_goal_key='desired_goal',
                get_action_kwargs=dict(
                    return_stacked_softmax=False,
                    mask=np.ones((1, self.env.unwrapped.num_blocks)),
                    deterministic=True,
                ),
            )
        else:
            path = rollout(
                self.env,
                self.policy,
                max_path_length=self.max_path_length,
            )
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
def rollout(*args, **kwargs):
    return multitask_rollout(
        *args,
        **kwargs,
        observation_key='latent_observation',
        desired_goal_key='latent_desired_goal',
    )
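# Hedged usage sketch (not part of the original code): the wrapper above keeps
# the plain rollout(env, agent, ...) calling convention while forcing
# latent-space observation/goal keys, so it can be dropped in wherever a
# standard rollout function is expected. The env/policy arguments and the
# 100-step horizon below are illustrative only.
def example_latent_space_rollout(env, policy):
    path = rollout(env, policy, max_path_length=100)
    # multitask_rollout returns a dict of per-step arrays; the snippets in this
    # file index it with keys such as 'observations', 'actions', and 'terminals'.
    return len(path['actions'])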
def simulate_policy(args):
    if args.pause:
        import ipdb
        ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))
    policy = data['policy']
    env = data['env']
    print("Policy and environment loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    if isinstance(env, VAEWrappedEnv):
        env.mode(args.mode)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    policy.train(False)
    paths = []
    while True:
        paths.append(multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=not args.hide,
            observation_key='observation',
            desired_goal_key='desired_goal',
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()
def eval_multitask_rollout(self):
    return multitask_rollout(
        self.env,
        self.policy,
        self.max_path_length,
        observation_key=self.observation_key,
        desired_goal_key=self.desired_goal_key,
    )
def rollout_fn():
    return multitask_rollout(
        env,
        policy,
        horizon,
        render,
        observation_key="observation",
        desired_goal_key="desired_goal",
        representation_goal_key="representation_goal",
        **reset_kwargs,
    )
def simulate_policy(args):
    if args.pause:
        import ipdb
        ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    if 'policy' in data:
        policy = data['policy']
    elif 'evaluation/policy' in data:
        policy = data['evaluation/policy']
    if 'env' in data:
        env = data['env']
    elif 'evaluation/env' in data:
        env = data['evaluation/env']
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    else:
        ptu.set_gpu_mode(False)
        policy.to(ptu.device)
    if isinstance(env, VAEWrappedEnv):
        env.mode(args.mode)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.multitaskpause:
        env.pause_on_goal = True
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=not args.hide,
            observation_key=data.get('evaluation/observation_key', 'observation'),
            desired_goal_key=data.get('evaluation/desired_goal_key', 'desired_goal'),
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()
def eval_multitask_rollout(self):
    get_action_kwargs = dict()
    # if not hasattr(self, "exploration_masking") or self.exploration_masking:
    #     masks = np.pad(masks, ((0,0), (0, int(self.replay_buffer.max_num_blocks - self.env.unwrapped.num_blocks))), "constant", constant_values=((0,0), (0, 0)))
    get_action_kwargs['mask'] = get_masks(
        self.env.unwrapped.num_blocks,
        self.replay_buffer.max_num_blocks,
        1,
        keepdim=True,
    )
    return multitask_rollout(
        self.env,
        self.policy,
        self.max_path_length,
        observation_key=self.observation_key,
        desired_goal_key=self.desired_goal_key,
        get_action_kwargs=get_action_kwargs,
        max_num_blocks=self.replay_buffer.max_num_blocks,
        cur_num_blocks=self.env.unwrapped.num_blocks,
    )
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
            observation_key='observation',
            desired_goal_key='desired_goal',
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def collect_new_paths(self, max_path_length, num_steps):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        path = multitask_rollout(
            self._env,
            self._policy,
            max_path_length=min(  # Do not go over num_steps
                max_path_length,
                num_steps - num_steps_collected,
            ),
            observation_key=self._observation_key,
            desired_goal_key=self._desired_goal_key,
            return_dict_obs=True,
        )
        num_steps_collected += len(path['actions'])
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def simulate_policy(args):
    # import torch
    # torch.manual_seed(6199)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))
    policy = data['algorithm'].policy
    num_blocks = 6
    stack_only = True
    # env = data['env']
    env = gym.make(
        F"FetchBlockConstruction_{num_blocks}Blocks_IncrementalReward_DictstateObs_42Rendersize_{stack_only}Stackonly_AllCase-v1"
    )
    env = Monitor(env, force=True, directory="videos/", video_callable=lambda x: x)
    print("Policy and environment loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    if args.enable_render or hasattr(env, 'enable_render'):
        # some environments need to be reconfigured for visualization
        env.enable_render()
    policy.train(False)
    failures = []
    successes = []
    for path_idx in range(100):
        path = multitask_rollout(
            env,
            policy,
            max_path_length=num_blocks * 50,
            animated=not args.hide,
            observation_key='observation',
            desired_goal_key='desired_goal',
            get_action_kwargs=dict(mask=np.ones((1, num_blocks)), deterministic=True),
        )
        if not is_solved(path, num_blocks):
            failures.append(path)
            print(F"Failed {path_idx}")
        else:
            print(F"Succeeded {path_idx}")
            successes.append(path)
    # if hasattr(env, "log_diagnostics"):
    #     env.log_diagnostics(paths)
    # if hasattr(env, "get_diagnostics"):
    #     for k, v in env.get_diagnostics(paths).items():
    #         logger.record_tabular(k, v)
    # logger.dump_tabular()
    print(f"Success rate {len(successes) / (len(successes) + len(failures))}")
    from rlkit.core.eval_util import get_generic_path_information
    path_info = get_generic_path_information(successes + failures, num_blocks=num_blocks)
    print(path_info)
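# Hedged sketch (not part of the original code) of the command-line entry point
# that typically drives the simulate_policy() variants above. The flag set is
# assembled from the args.* attributes those functions reference (file, H, gpu,
# hide, pause, mode, enable_render, multitaskpause); names and defaults are
# assumptions, so adjust them to the actual script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the saved snapshot (.pkl / .pt)')
    parser.add_argument('--H', type=int, default=300, help='max path length per rollout')
    parser.add_argument('--mode', type=str, default='video_env', help='mode for VAEWrappedEnv')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--hide', action='store_true', help='do not render rollouts')
    parser.add_argument('--pause', action='store_true', help='drop into ipdb before loading')
    parser.add_argument('--enable_render', action='store_true')
    parser.add_argument('--multitaskpause', action='store_true')
    args = parser.parse_args()

    simulate_policy(args)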