def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            qf2 = None

        if EXPLICIT_VF:
            vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['v_hidden_w_init'],
                output_w_init=variant['v_output_w_init'],
            )
        else:
            vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        qf2=qf2,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
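# Usage sketch for the script above. Every value below is an illustrative
# placeholder, not a setting from the original experiments; the module-level
# globals (NP_THREADS, USE_Q2, EXPLICIT_VF, POLICY, BATCH_SIZE) and imports
# (os, joblib, numpy as np, ptu, the env/network/buffer/SAC classes) are
# assumed to already be defined in this file.
if __name__ == '__main__':
    example_variant = dict(
        seed=610,
        gpu=False,
        env_params=dict(),               # forwarded to Navigation2dGoalCompoEnv
        load_dir=None,                   # truthy -> resume from log_dir/params.pkl
        log_dir='./logs',
        net_size=64,
        hidden_activation='relu',        # placeholder activation spec
        q_hidden_w_init='xavier_normal',
        q_output_w_init='xavier_normal',
        v_hidden_w_init='xavier_normal',
        v_output_w_init='xavier_normal',
        pol_hidden_w_init='xavier_normal',
        pol_output_w_init='xavier_normal',
        replay_buffer_size=int(1e6),
        steps_pretrain=1000,
        algo_params=dict(),              # forwarded to SAC
    )
    experiment(example_variant)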
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        i_qf2 = data['qf2']
        u_qf = data['u_qf']
        u_qf2 = data['u_qf2']
        i_vf = data['i_vf']
        u_vf = data['u_vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        # Q-functions for the unintentional (u_*) tasks and the
        # intentional (i_*) task.
        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        if USE_Q2:
            u_qf2 = NNMultiQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                n_qs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_qf2 = None
            i_qf2 = None

        if EXPLICIT_VF:
            u_vf = NNMultiVFunction(
                obs_dim=obs_dim,
                n_vs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_vf = None
            i_vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUSAC(
        env=env,
        policy=policy,
        u_qf1=u_qf,
        i_qf1=i_qf,
        u_qf2=u_qf2,
        i_qf2=i_qf2,
        u_vf=u_vf,
        i_vf=i_vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    # algorithm.pretrain(10000)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
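# The `load_dir` branch above resumes from '<log_dir>/params.pkl'. As a
# companion, here is a hypothetical helper (not part of this codebase; its
# actual logger may snapshot differently) that writes a checkpoint with
# exactly the keys the load branch reads back:
def save_hiu_snapshot(log_dir, epoch, policy, env,
                      i_qf, i_qf2, u_qf, u_qf2, i_vf, u_vf):
    # Key names mirror the load branch: the intentional networks are stored
    # under 'qf'/'qf2', the unintentional ones under 'u_*', and the
    # observation-normalization statistics alongside them.
    data = dict(
        epoch=epoch,
        policy=policy,
        qf=i_qf,
        qf2=i_qf2,
        u_qf=u_qf,
        u_qf2=u_qf2,
        i_vf=i_vf,
        u_vf=u_vf,
        obs_mean=env._obs_mean,
        obs_var=env._obs_var,
    )
    joblib.dump(data, os.path.join(log_dir, 'params.pkl'))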
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    # Expand the single goal into composable goal poses:
    # the full goal plus the two axis-aligned subgoals.
    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # PPO is on-policy: the replay buffer is constructed above but not
    # passed to the algorithm (the corresponding kwargs stay commented out).
    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        # replay_buffer=replay_buffer,
        # batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(
        env=env,
        policy=policy,
        explo_policy=exploration_policy,
        u_qf=u_qf,
        i_qf=i_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None

        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )

        # # Clamp model parameters
        # qf.clamp_all_params(min=-0.003, max=0.003)
        # vf.clamp_all_params(min=-0.003, max=0.003)
        # policy.clamp_all_params(min=-0.003, max=0.003)
        # if USE_Q2:
        #     qf2.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        qf2=qf2,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
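# Note that each `experiment` above mutates its `variant` in place (e.g.
# writing variant['env_params']['seed'], or popping 'goal' in the PPO script),
# so reusing one dict across runs would leak state between them. Below is a
# minimal sketch of a multi-seed sweep that copies the variant per run;
# `run_seeds` is a hypothetical helper, not part of this codebase:
import copy

def run_seeds(base_variant, seeds=(0, 1, 2)):
    algorithms = []
    for seed in seeds:
        v = copy.deepcopy(base_variant)   # isolate in-place edits per run
        v['seed'] = seed
        algorithms.append(experiment(v))
    return algorithms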