def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']
    print("Policy loaded")

    env = data['env']

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
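# Hypothetical entry point sketching the arguments that simulate_policy above
# expects (file, H, gpu, deterministic). This is only an illustration of the
# assumed CLI; the real script's parser, argument names, and defaults may
# differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='Path to the joblib snapshot to load.')
    parser.add_argument('--H', type=int, default=100,
                        help='Maximum path length per rollout.')
    parser.add_argument('--gpu', action='store_true',
                        help='Run the policy on GPU.')
    parser.add_argument('--deterministic', action='store_true',
                        help='Use the deterministic version of the policy.')
    args = parser.parse_args()

    simulate_policy(args)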
def pretrain(self, n_pretrain_samples):
    if (
        self.num_paths_for_normalization == 0
        or (self.obs_normalizer is None and self.action_normalizer is None)
    ):
        # No normalization statistics requested: just fill the replay buffer
        # with random-action transitions.
        observation = self.explo_env.reset()
        for ii in range(n_pretrain_samples):
            action = self.explo_env.action_space.sample()

            # Interact with the environment
            next_ob, reward, terminal, env_info = \
                self.explo_env.step(action)
            agent_info = None

            # Increase counter
            self._n_env_steps_total += 1

            # Wrap the obtained terminal and reward as np.arrays
            terminal = np.array([terminal])
            reward = np.array([reward])

            # Add the transition to the replay buffer
            self.replay_buffer.add_sample(
                observation=observation,
                action=action,
                reward=reward,
                terminal=terminal,
                next_observation=next_ob,
                agent_info=agent_info,
                env_info=env_info,
            )
            observation = next_ob

            if self._obs_normalizer is not None:
                self._obs_normalizer.update(np.array([observation]))

            if terminal:
                # Start the next transition from the reset observation
                observation = self.explo_env.reset()
    else:
        # Collect random-policy paths first and use them to set the
        # normalization statistics.
        pretrain_paths = []
        random_policy = RandomPolicy(self.explo_env.action_space)
        while len(pretrain_paths) < self.num_paths_for_normalization:
            path = rollout(self.explo_env, random_policy,
                           self.max_path_length)
            pretrain_paths.append(path)

        ob_mean, ob_std, ac_mean, ac_std = \
            compute_normalization(pretrain_paths)

        if self.obs_normalizer is not None:
            self.obs_normalizer.set_mean(ob_mean)
            self.obs_normalizer.set_std(ob_std)
            self._target_qf.obs_normalizer = self.obs_normalizer
            self._target_policy.obs_normalizer = self.obs_normalizer
        if self.action_normalizer is not None:
            self.action_normalizer.set_mean(ac_mean)
            self.action_normalizer.set_std(ac_std)
            self._target_qf.action_normalizer = self.action_normalizer
            self._target_policy.action_normalizer = self.action_normalizer
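# Hypothetical sketch of the compute_normalization helper used in pretrain
# above. It is assumed to stack the observations and actions of the collected
# random-policy paths and return their per-dimension means and standard
# deviations; the repository's actual helper may differ.
def compute_normalization_sketch(paths, eps=1e-8):
    import numpy as np

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    acts = np.concatenate([path['actions'] for path in paths], axis=0)
    ob_mean = obs.mean(axis=0)
    ob_std = obs.std(axis=0) + eps   # avoid zero std for constant dimensions
    ac_mean = acts.mean(axis=0)
    ac_std = acts.std(axis=0) + eps
    return ob_mean, ob_std, ac_mean, ac_std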
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional '
                  'policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    # env = data['env']
    env = NormalizedBoxEnv(gym.make(args.env))
    print("Environment loaded!!")

    # # Load environment
    # with open('variant.json') as json_data:
    #     env_params = json.load(json_data)['env_params']
    # env_params.pop('goal')
    # env_params['is_render'] = True
    # env = NormalizedBoxEnv(args.env(**env_params))
    # print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    # else:
    #     set_gpu_mode(False)
    #     policy.cpu()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        if args.record:
            env.start_recording_video('prueba.mp4')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        print('Accumulated reward is: ', path['rewards'].sum())

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            env.stop_recording_video()
            break
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional '
                  'policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params.pop('goal', None)
    env_params['is_render'] = True
    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
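# Hedged sketch of the contract the extended rollout helper is assumed to
# fulfil for the calls above: obs_normalizer (if given) normalizes each
# observation before the policy sees it, and rollout_start_fcn/rollout_end_fcn
# are called once before/after the episode (e.g. to start/stop video
# recording). This is an illustration under assumed policy/normalizer APIs,
# not the repository's implementation.
def rollout_sketch(env, policy, max_path_length, animated=False,
                   obs_normalizer=None, rollout_start_fcn=None,
                   rollout_end_fcn=None):
    import numpy as np

    observations, actions, rewards = [], [], []
    if rollout_start_fcn is not None:
        rollout_start_fcn()
    obs = env.reset()
    for _ in range(max_path_length):
        if obs_normalizer is not None:
            obs = obs_normalizer.normalize(obs)   # assumed normalizer API
        action, _ = policy.get_action(obs)        # assumed policy API
        next_obs, reward, done, _ = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        if animated:
            env.render()
        obs = next_obs
        if done:
            break
    if rollout_end_fcn is not None:
        rollout_end_fcn()
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
    )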
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional '
                  'policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # obs_mean = np.array([ 0.07010766, 0.37585765, 0.21402615, 0.24426296, 0.5789634 ,
        #                       0.88510203, 1.6878743 , 0.02656335, 0.03794186, -1.0241051 ,
        #                       -0.5226027 , 0.6198239 , 0.49062446, 0.01197532, 0.7888951 ,
        #                       -0.4857273 , 0.69160587, -0.00617676, 0.08966777, -0.14694819,
        #                       0.9559917 , 1.0450271 , -0.40958315, 0.86435956, 0.00609685,
        #                       -0.01115279, -0.21607827, 0.9762933 , 0.80748135, -0.48661205,
        #                       0.7473679 , 0.01649722, 0.15451911, -0.17285274, 0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407   , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269  , 0.84190637, 1.5407104 , 0.1       ,
        #                     0.10330457, 0.1       , 0.1       , 0.1       , 0.1528581 ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.12320185,
        #                     0.1       , 0.18369523, 0.200373  , 0.11895574, 0.15118493])

    print(env_params)
    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
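# Hypothetical sketch of the plot_rollout_reward utility called right after
# the rollout above. It is assumed to plot the per-step and cumulative reward
# of a single path with matplotlib; the actual plotting function may differ.
def plot_rollout_reward_sketch(path):
    import numpy as np
    import matplotlib.pyplot as plt

    rewards = np.asarray(path['rewards']).squeeze()
    fig, (ax_step, ax_cum) = plt.subplots(2, 1, sharex=True)
    ax_step.plot(rewards)
    ax_step.set_ylabel('reward')
    ax_cum.plot(np.cumsum(rewards))
    ax_cum.set_ylabel('cumulative reward')
    ax_cum.set_xlabel('time step')
    plt.show()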
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env_params['is_render'] = True
    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break