def __init__(self,
             env,
             policy,
             policy_lr=1e-3,
             optimizer_class=optim.Adam,
             causality=True,
             discounted=False,
             plotter=None,
             eval_deterministic=True,
             **kwargs):
    """
    Args:
        env: Environment to interact with.
        policy (`robolearn.PyTorchModule`): Policy approximator.
        policy_lr (`float`): Learning rate used for the policy approximator.
        optimizer_class: Optimizer class used for the policy parameters.
        causality (`bool`): Use reward-to-go (causal) returns instead of
            full-trajectory returns.
        discounted (`bool`): Apply the discount factor when computing returns.
        plotter: Plotter instance used for visualization during training.
        eval_deterministic (`bool`): Evaluate with the deterministic version
            of the current policy.
        **kwargs: Additional arguments forwarded to the base RL algorithm.
    """
    if eval_deterministic:
        eval_policy = MakeDeterministic(policy)
    else:
        eval_policy = policy

    super(Reinforce, self).__init__(env=env,
                                    exploration_policy=policy,
                                    eval_policy=eval_policy,
                                    **kwargs)

    self.policy = policy
    self.plotter = plotter

    # Env data
    self._action_dim = self.explo_env.action_space.low.size
    self._obs_dim = self.explo_env.observation_space.low.size

    # Optimize Policy
    self.policy_optimizer = optimizer_class(
        self.policy.parameters(),
        lr=policy_lr,
    )

    # Return computation
    self._causality = causality
    self.discounted = discounted
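
# A minimal, self-contained sketch (not part of the robolearn codebase) of how
# the `causality` and `discounted` flags stored above are typically used when
# turning a trajectory of rewards into per-step returns for the REINFORCE
# policy-gradient loss. The function name and the `discount` value are
# illustrative assumptions, not the library's API.
import torch


def compute_returns(rewards, causality=True, discounted=False, discount=0.99):
    """Per-step returns for a single trajectory.

    rewards: 1-D tensor of rewards r_0, ..., r_{T-1}.
    causality: if True, use reward-to-go G_t = sum_{t'>=t} gamma^{t'-t} r_{t'};
        otherwise every step receives the same full-trajectory return.
    discounted: if False, gamma is treated as 1.
    """
    gamma = discount if discounted else 1.0
    T = rewards.shape[0]
    returns = torch.zeros_like(rewards)
    running = 0.0
    # Accumulate reward-to-go backwards in time.
    for t in reversed(range(T)):
        running = rewards[t] + gamma * running
        returns[t] = running
    if not causality:
        # Every time step uses the return of the whole trajectory.
        returns = returns[0] * torch.ones_like(returns)
    return returns


# Usage: the REINFORCE loss is then -(log pi(a_t|s_t) * G_t).mean().
if __name__ == '__main__':
    print(compute_returns(torch.tensor([1.0, 0.0, 2.0]),
                          causality=True, discounted=True))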
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
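
# A hedged sketch of the command-line interface that simulate_policy() above
# (and the other playback scripts below) appears to rely on: args.file,
# args.H, args.un, args.deterministic, args.gpu, args.record, args.subtask.
# The exact defaults and help strings are assumptions for illustration, not
# the repository's actual entry point.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='Path to the snapshot file saved by training.')
    parser.add_argument('--H', type=int, default=-1,
                        help='Max path length (-1 keeps the logged value).')
    parser.add_argument('--un', type=int, default=-1,
                        help='Index of the unintentional policy (-1: intentional).')
    parser.add_argument('--deterministic', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--subtask', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    simulate_policy(parse_args())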
def __init__(
        self,
        explo_env,
        policy,
        qf,
        replay_buffer,
        batch_size=1024,
        normalize_obs=False,
        eval_env=None,
        vf=None,
        qf2=None,
        action_prior='uniform',
        entropy_scale=1.,
        auto_alpha=True,
        tgt_entro=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        policy_mean_regu_weight=1e-3,
        policy_std_regu_weight=1e-3,
        policy_pre_activation_weight=0.,
        policy_weight_decay=0.,
        q_weight_decay=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        soft_target_tau=5e-3,
        target_update_interval=1,
        reward_scale=1.,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=explo_env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(self,
                         explo_env=explo_env,
                         explo_policy=self._policy,
                         eval_env=eval_env,
                         eval_policy=eval_policy,
                         obs_normalizer=self._obs_normalizer,
                         **kwargs)

    # Q-function(s) and V-function
    self._qf = qf
    self._qf2 = qf2
    if vf is None:
        self._vf = None
        self._target_vf = None
        self._target_qf1 = qf.copy()
        self._target_qf2 = None if qf2 is None else qf2.copy()
    else:
        self._vf = vf
        self._target_vf = vf.copy()
        self._target_qf1 = None
        self._target_qf2 = None

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # Soft-update rate for target V-function
    self._soft_target_tau = soft_target_tau
    self._target_update_interval = target_update_interval

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._entropy_scale = entropy_scale

    # Desired Alpha
    self._auto_alpha = auto_alpha
    if tgt_entro is None:
        tgt_entro = -explo_env.action_dim
    self._tgt_entro = torch.tensor([float(tgt_entro)], device=ptu.device)
    self._log_alpha = torch.zeros(1, device=ptu.device, requires_grad=True)

    # Reward Scale
    self.reward_scale = reward_scale

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    self.qf_lr = qf_lr
    self.policy_lr = policy_lr

    # Q-function(s) optimizer(s)
    self._qf1_optimizer = optimizer_class(self._qf.parameters(),
                                          lr=qf_lr,
                                          weight_decay=q_weight_decay,
                                          **optimizer_kwargs)
    values_parameters = self._qf.parameters()
    if self._qf2 is None:
        self._qf2_optimizer = None
    else:
        self._qf2_optimizer = optimizer_class(self._qf2.parameters(),
                                              lr=qf_lr,
                                              weight_decay=q_weight_decay,
                                              **optimizer_kwargs)
        values_parameters = chain(values_parameters,
                                  self._qf2.parameters())

    # V-function optimizer
    if self._vf is None:
        self._vf_optimizer = None
    else:
        self._vf_optimizer = optimizer_class(self._vf.parameters(),
                                             lr=qf_lr,
                                             weight_decay=q_weight_decay,
                                             **optimizer_kwargs)
        values_parameters = chain(values_parameters,
                                  self._vf.parameters())
    self._values_optimizer = optimizer_class(values_parameters,
                                             lr=qf_lr,
                                             weight_decay=q_weight_decay,
                                             **optimizer_kwargs)

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Alpha optimizer
    self._alpha_optimizer = optimizer_class([self._log_alpha],
                                            lr=policy_lr,
                                            **optimizer_kwargs)
    # Weights for policy regularization coefficients
    self.pol_mean_regu_weight = policy_mean_regu_weight
    self.pol_std_regu_weight = policy_std_regu_weight
    self.pol_pre_activation_weight = policy_pre_activation_weight

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Pol KL Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Qf2 Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Vf Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Rewards'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Pol Entropy'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Pol Log Std'] = np.zeros((
        self.num_train_steps_per_epoch,
        self.explo_env.action_dim,
    ))
    self.log_data['Policy Mean'] = np.zeros((
        self.num_train_steps_per_epoch,
        self.explo_env.action_dim,
    ))
    self.log_data['Alphas'] = np.zeros(self.num_train_steps_per_epoch)

    # Tensorboard-like Logging
    self._log_tensorboard = log_tensorboard
    if log_tensorboard:
        self._summary_writer = \
            tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
    else:
        self._summary_writer = None
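
# A minimal sketch (an assumption about usage, not robolearn's actual training
# loop) of the Polyak averaging that `soft_target_tau` and
# `target_update_interval` initialized above typically drive: every
# `target_update_interval` gradient steps the target network is moved a small
# step towards the live network.
import torch


def soft_update_from_to(source, target, tau):
    """target <- (1 - tau) * target + tau * source, parameter by parameter."""
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)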
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])

    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # obs_mean = np.array([ 0.07010766, 0.37585765, 0.21402615, 0.24426296, 0.5789634 ,
        #                       0.88510203, 1.6878743 , 0.02656335, 0.03794186, -1.0241051 ,
        #                       -0.5226027 , 0.6198239 , 0.49062446, 0.01197532, 0.7888951 ,
        #                       -0.4857273 , 0.69160587, -0.00617676, 0.08966777, -0.14694819,
        #                       0.9559917 , 1.0450271 , -0.40958315, 0.86435956, 0.00609685,
        #                       -0.01115279, -0.21607827, 0.9762933 , 0.80748135, -0.48661205,
        #                       0.7473679 , 0.01649722, 0.15451911, -0.17285274, 0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407 , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269 , 0.84190637, 1.5407104 , 0.1 ,
        #                     0.10330457, 0.1 , 0.1 , 0.1 , 0.1528581 ,
        #                     0.1 , 0.1 , 0.1 , 0.1 , 0.1 ,
        #                     0.1 , 0.1 , 0.1 , 0.1 , 0.12320185,
        #                     0.1 , 0.18369523, 0.200373 , 0.11895574, 0.15118493])

    print(env_params)

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env_params['is_render'] = True

    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
def __init__(
        self,
        env,
        policy,
        qf,
        replay_buffer,
        normalize_obs=False,
        eval_env=None,
        action_prior='uniform',
        entropy_scale=1.,
        policy_lr=1e-4,
        qf_lr=1e-3,
        policy_weight_decay=0,
        qf_weight_decay=0,
        residual_gradient_weight=0,
        epoch_discount_schedule=None,
        policy_mean_regu_weight=1e-3,
        policy_std_regu_weight=1e-3,
        policy_pre_activation_weight=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        target_hard_update_period=1000,
        tau=1e-2,
        use_soft_update=False,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(self,
                         env=env,
                         exploration_policy=self._policy,
                         eval_env=eval_env,
                         eval_policy=eval_policy,
                         obs_normalizer=self._obs_normalizer,
                         **kwargs)

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._entropy_scale = entropy_scale

    # Q-function
    self._qf = qf

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    # Q-function(s) optimizer(s)
    self._qf_optimizer = optimizer_class(self._qf.parameters(),
                                         lr=qf_lr,
                                         weight_decay=0,
                                         **optimizer_kwargs)

    # Policy optimizer
    self._policy_optimizer = optimizer_class(self._policy.parameters(),
                                             lr=policy_lr,
                                             weight_decay=0,
                                             **optimizer_kwargs)

    # Policy regularization coefficients (weights)
    self._policy_mean_regu_weight = policy_mean_regu_weight
    self._policy_std_regu_weight = policy_std_regu_weight
    self._policy_pre_activation_weight = policy_pre_activation_weight

    # Useful Variables for logging
    self.logging_pol_kl_loss = np.zeros(self.num_train_steps_per_epoch)
    self.logging_qf_loss = np.zeros(self.num_train_steps_per_epoch)
    self.logging_rewards = np.zeros(self.num_train_steps_per_epoch)
    self.logging_policy_entropy = np.zeros(self.num_train_steps_per_epoch)
    self.logging_policy_log_std = np.zeros(
        (self.num_train_steps_per_epoch, self.explo_env.action_dim))
    self.logging_policy_mean = np.zeros(
        (self.num_train_steps_per_epoch, self.explo_env.action_dim))

    self._log_tensorboard = log_tensorboard
    self._summary_writer = tensorboardX.SummaryWriter(
        log_dir=logger.get_snapshot_dir())
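
# A hedged sketch (usage inferred from the coefficient names above, not the
# repository's exact loss) of how `policy_mean_regu_weight`,
# `policy_std_regu_weight` and `policy_pre_activation_weight` are commonly
# applied: small quadratic penalties on the policy's mean, log-std and
# pre-tanh activations are added to the policy loss.
import torch


def policy_regularization_loss(policy_mean, policy_log_std, pre_tanh_value,
                               mean_weight=1e-3, std_weight=1e-3,
                               pre_activation_weight=0.):
    mean_regu = mean_weight * (policy_mean ** 2).mean()
    std_regu = std_weight * (policy_log_std ** 2).mean()
    pre_activation_regu = pre_activation_weight * \
        (pre_tanh_value ** 2).sum(dim=-1).mean()
    return mean_regu + std_regu + pre_activation_regu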
def __init__(self,
             env,
             qf,
             policy,
             qf_lr=1e-3,
             policy_lr=1e-3,
             optimizer_class=optim.Adam,
             use_hard_updates=False,
             hard_update_period=1000,
             soft_target_tau=0.001,
             value_n_particles=16,
             kernel_fn=adaptive_isotropic_gaussian_kernel,
             kernel_n_particles=16,
             kernel_update_ratio=0.5,
             epoch_plotter=None,
             eval_deterministic=True,
             **kwargs):
    """
    Args:
        env: Environment to interact with.
        qf (`robolearn.PyTorchModule`): Q-function approximator.
        policy (`robolearn.PyTorchModule`): Policy approximator.
        qf_lr (`float`): Learning rate used for the Q-function approximator.
        policy_lr (`float`): Learning rate used for the policy approximator.
        optimizer_class: Optimizer class used for both approximators.
        use_hard_updates (`bool`): Use a hard rather than soft update.
        hard_update_period (`int`): How many gradient steps before copying
            the parameters over. Used if `use_hard_updates` is True.
        soft_target_tau (`float`): Soft target tau to update target QF.
            Used if `use_hard_updates` is False.
        value_n_particles (`int`): The number of action samples used for
            estimating the value of the next state.
        kernel_fn (function object): A function object that represents a
            kernel function.
        kernel_n_particles (`int`): Total number of particles per state
            used in SVGD updates.
        kernel_update_ratio (`float`): Ratio used to split the SVGD
            particles into fixed (kernel) and updated particles.
        epoch_plotter (`MultiQFPolicyPlotter`): Plotter instance to be used
            for visualizing the Q-function during training.
        eval_deterministic (`bool`): Evaluate with the deterministic version
            of the current policy.
        **kwargs: Additional arguments forwarded to the base RL algorithm.
    """
    if eval_deterministic:
        eval_policy = MakeDeterministic(policy)
    else:
        eval_policy = policy

    super(SQL, self).__init__(env=env,
                              exploration_policy=policy,
                              eval_policy=eval_policy,
                              **kwargs)

    self.policy = policy
    self.qf = qf
    self.target_qf = self.qf.copy()
    self._epoch_plotter = epoch_plotter

    # Env data
    self._action_dim = self.explo_env.action_space.low.size
    self._obs_dim = self.explo_env.observation_space.low.size

    # Optimize Q-fcn
    self.qf_optimizer = optimizer_class(
        self.qf.parameters(),
        lr=qf_lr,
    )
    self._value_n_particles = value_n_particles

    # Optimize Policy
    self.policy_optimizer = optimizer_class(
        self.policy.parameters(),
        lr=policy_lr,
    )
    self._kernel_n_particles = kernel_n_particles
    self._kernel_update_ratio = kernel_update_ratio
    self._kernel_fn = kernel_fn

    # Optimize target Q-fcn
    self.use_hard_updates = use_hard_updates
    self.hard_update_period = hard_update_period
    self.soft_target_tau = soft_target_tau
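
# A hedged, self-contained sketch (not robolearn's implementation) of the
# importance-sampled soft value estimate that `value_n_particles` above is
# used for in Soft Q-Learning:
# V(s') ~= alpha * log E_{a ~ Uniform[-1, 1]^d}[exp(Q(s', a) / alpha)].
# `q_fcn`, `action_dim` and `alpha` are illustrative placeholders.
import math
import torch


def soft_value(q_fcn, next_obs, action_dim, n_particles=16, alpha=1.0):
    """Estimate V(s') for a batch of next observations (N x obs_dim)."""
    n = next_obs.shape[0]
    # Sample uniform actions in [-1, 1]^d as the proposal distribution.
    actions = torch.rand(n, n_particles, action_dim) * 2.0 - 1.0
    obs_rep = next_obs.unsqueeze(1).expand(-1, n_particles, -1)
    q_values = q_fcn(obs_rep.reshape(n * n_particles, -1),
                     actions.reshape(n * n_particles, -1)).reshape(n, n_particles)
    # Soft maximum over the sampled actions (log-mean-exp), scaled by alpha.
    v = alpha * (torch.logsumexp(q_values / alpha, dim=1) - math.log(n_particles))
    # Importance-weight correction for the uniform proposal over [-1, 1]^d.
    return v + alpha * action_dim * math.log(2.0)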
def __init__(
        self,
        env,
        policy,
        u_qf1,
        replay_buffer,
        batch_size=1024,
        normalize_obs=False,
        eval_env=None,
        i_qf1=None,
        u_qf2=None,
        i_qf2=None,
        i_vf=None,
        u_vf=None,
        action_prior='uniform',
        i_entropy_scale=1.,
        u_entropy_scale=None,
        auto_alpha=True,
        i_tgt_entro=None,
        u_tgt_entros=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        i_policy_mean_regu_weight=1e-3,
        i_policy_std_regu_weight=1e-3,
        i_policy_pre_activation_weight=0.,
        i_policy_mixing_coeff_weight=1e-3,
        u_policy_mean_regu_weight=None,
        u_policy_std_regu_weight=None,
        u_policy_pre_activation_weight=None,
        policy_weight_decay=0.,
        q_weight_decay=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        i_soft_target_tau=5e-3,
        u_soft_target_tau=5e-3,
        i_target_update_interval=1,
        u_target_update_interval=1,
        reward_scale=1.,
        u_reward_scales=None,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(self,
                         explo_env=env,
                         explo_policy=self._policy,
                         eval_env=eval_env,
                         eval_policy=eval_policy,
                         obs_normalizer=self._obs_normalizer,
                         **kwargs)

    # Number of Unintentional Tasks (Composable Tasks)
    self._n_unintentional = self._policy.n_heads

    # Evaluation Sampler (One for each unintentional)
    self.eval_u_samplers = [
        InPlacePathSampler(
            env=env,
            policy=WeightedMultiPolicySelector(self._policy, idx),
            total_samples=self.num_steps_per_eval,
            max_path_length=self.max_path_length,
            deterministic=True,
        )
        for idx in range(self._n_unintentional)
    ]

    # Intentional (Main Task) Q-functions
    self._i_qf1 = i_qf1
    self._i_qf2 = i_qf2
    if i_vf is None:
        self._i_vf = None
        self._i_target_vf = None
        self._i_target_qf1 = self._i_qf1.copy()
        self._i_target_qf2 = \
            None if self._i_qf2 is None else self._i_qf2.copy()
    else:
        self._i_vf = i_vf
        self._i_target_vf = self._i_vf.copy()
        self._i_target_qf1 = None
        self._i_target_qf2 = None

    # Unintentional (Composable Tasks) Q-functions
    self._u_qf1 = u_qf1
    self._u_qf2 = u_qf2
    if u_vf is None:
        self._u_vf = None
        self._u_target_vf = None
        self._u_target_qf1 = self._u_qf1.copy()
        self._u_target_qf2 = self._u_qf2.copy()
    else:
        self._u_vf = u_vf
        self._u_target_vf = self._u_vf.copy()
        self._u_target_qf1 = None
        self._u_target_qf2 = None

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # Soft-update rate for target V-functions
    self._i_soft_target_tau = i_soft_target_tau
    self._u_soft_target_tau = u_soft_target_tau
    self._i_target_update_interval = i_target_update_interval
    self._u_target_update_interval = u_target_update_interval

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._i_entropy_scale = i_entropy_scale
    if u_entropy_scale is None:
        u_entropy_scale = [
            i_entropy_scale for _ in range(self._n_unintentional)
        ]
    self._u_entropy_scale = torch.tensor(u_entropy_scale,
                                         dtype=torch.float32,
                                         device=ptu.device)

    # Desired Alphas
    self._auto_alphas = auto_alpha
    if i_tgt_entro is None:
        i_tgt_entro = -env.action_dim
    self._i_tgt_entro = torch.tensor([i_tgt_entro],
                                     dtype=torch.float32,
                                     device=ptu.device)
    if u_tgt_entros is None:
        u_tgt_entros = [i_tgt_entro for _ in range(self._n_unintentional)]
    self._u_tgt_entros = torch.tensor(u_tgt_entros,
                                      dtype=torch.float32,
                                      device=ptu.device)
    self._u_log_alphas = torch.zeros(self._n_unintentional,
                                     device=ptu.device,
                                     requires_grad=True)
    self._i_log_alpha = torch.zeros(1,
                                    device=ptu.device,
                                    requires_grad=True)

    # Reward Scales
    self.reward_scale = reward_scale
    if u_reward_scales is None:
        u_reward_scales = [
            reward_scale for _ in range(self._n_unintentional)
        ]
    self._u_reward_scales = torch.tensor(u_reward_scales,
                                         dtype=torch.float32,
                                         device=ptu.device)

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    # Values optimizer
    vals_params_list = [self._u_qf1.parameters(), self._i_qf1.parameters()]
    if self._u_qf2 is not None:
        vals_params_list.append(self._u_qf2.parameters())
    if self._i_qf2 is not None:
        vals_params_list.append(self._i_qf2.parameters())
    if self._u_vf is not None:
        vals_params_list.append(self._u_vf.parameters())
    if self._i_vf is not None:
        vals_params_list.append(self._i_vf.parameters())
    vals_params = chain(*vals_params_list)
    self._values_optimizer = optimizer_class(vals_params,
                                             lr=qf_lr,
                                             weight_decay=q_weight_decay,
                                             **optimizer_kwargs)

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Alpha optimizers
    self._alphas_optimizer = optimizer_class(
        [self._u_log_alphas, self._i_log_alpha],
        lr=policy_lr,
        **optimizer_kwargs
    )

    # Weights for policy regularization coefficients
    self._i_pol_mean_regu_weight = i_policy_mean_regu_weight
    self._i_pol_std_regu_weight = i_policy_std_regu_weight
    self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
    self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

    if u_policy_mean_regu_weight is None:
        u_policy_mean_regu_weight = [
            i_policy_mean_regu_weight for _ in range(self._n_unintentional)
        ]
    self._u_policy_mean_regu_weight = \
        torch.tensor(u_policy_mean_regu_weight,
                     dtype=torch.float32,
                     device=ptu.device)
    if u_policy_std_regu_weight is None:
        u_policy_std_regu_weight = [
            i_policy_std_regu_weight for _ in range(self._n_unintentional)
        ]
    self._u_policy_std_regu_weight = \
        torch.tensor(u_policy_std_regu_weight,
                     dtype=torch.float32,
                     device=ptu.device)
    if u_policy_pre_activation_weight is None:
        u_policy_pre_activation_weight = [
            i_policy_pre_activation_weight
            for _ in range(self._n_unintentional)
        ]
    self._u_policy_pre_activ_weight = \
        torch.tensor(u_policy_pre_activation_weight,
                     dtype=torch.float32,
                     device=ptu.device)

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Pol KL Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Qf Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Qf2 Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Vf Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Rewards'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Policy Entropy'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Policy Mean'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
        self.explo_env.action_dim,
    ))
    self.log_data['Pol Log Std'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
        self.explo_env.action_dim,
    ))
    self.log_data['Mixing Weights'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional,
        self.explo_env.action_dim,
    ))
    self.log_data['Alphas'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))

    # Tensorboard-like Logging
    self._log_tensorboard = log_tensorboard
    if log_tensorboard:
        self._summary_writer = \
            tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
    else:
        self._summary_writer = None
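
# A hedged sketch (an assumption, not the repository's exact update) of how
# the intentional and unintentional temperatures created above can be tuned
# with the single `self._alphas_optimizer`: each log-alpha is pushed so that
# its policy head reaches its target entropy. `i_log_pi` (batch,) and
# `u_log_pis` (batch, n_unintentional) are hypothetical log-probabilities of
# the sampled actions under the composed and the per-task policy heads.
import torch


def hiu_alpha_loss(i_log_alpha, u_log_alphas, i_log_pi, u_log_pis,
                   i_tgt_entro, u_tgt_entros):
    # A single scalar objective so one optimizer step updates every temperature.
    i_loss = -(i_log_alpha * (i_log_pi + i_tgt_entro).detach()).mean()
    u_loss = -(u_log_alphas * (u_log_pis + u_tgt_entros).detach()).mean()
    return i_loss + u_loss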
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()