def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']
    print("Policy loaded")

    env = data['env']

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)  # Put the policy in evaluation mode

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
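# A minimal sketch of how this script could be invoked from the command
# line. The flags mirror the attributes read above (file, deterministic,
# gpu, H); the argument names and defaults here are assumptions for
# illustration, not necessarily the script's real interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='Path to the snapshot (.pkl) file')
    parser.add_argument('--deterministic', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--H', type=int, default=100,
                        help='Max rollout length')
    simulate_policy(parser.parse_args())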
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Pusher2D3DofGoalCompoEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )
        for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train(online=True)

    return algorithm
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            # Both snapshot formats (with and without 'u_policy') resolve to
            # the same selector over the composed policy.
            policy = MakeDeterministic(
                WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=[net_size, net_size])
    policy = TanhMlpPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = DDPG(
        explo_env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
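# A minimal sketch of a `variant` dictionary this function could consume.
# The concrete values (env name, net size, algo_params keys) are assumptions
# for illustration; the real defaults live in the launcher script.
example_variant = dict(
    gpu=False,
    env_name='HalfCheetah-v2',
    net_size=64,
    algo_params=dict(
        replay_buffer_size=int(1e6),
        batch_size=64,
        # remaining DDPG hyperparameters (learning rates, tau, ...) go here
    ),
)
# algorithm = experiment(example_variant)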
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    # env = NormalizedBoxEnv(
    #     Reacher2D3DofBulletEnv(**variant['env_params'])
    # )
    env = Reacher2D3DofBulletEnv(**variant['env_params'])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # Initial conditions: first three entries are joint angles (degrees),
    # last three are the target state.
    initial_conds = [
        [10, 5, 20, 0.2, 0.5, 0],
        [10, 5, 20, 0.1, 0.1, 0],
        [10, 5, 20, 0.15, 0.8, 0],
    ]
    for init_cond in initial_conds:
        env.add_initial_condition(robot_config=np.deg2rad(init_cond[:3]),
                                  tgt_state=init_cond[-3:])

    net_size = variant['net_size']
    # global_policy = TanhGaussianPolicy(
    global_policy = MlpPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    local_policies = [
        LinearGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            T=PATH_LENGTH,
        )
        for _ in range(N_LOCAL_POLS)
    ]

    algorithm = MDGPS(
        env=env,
        eval_env=env,
        save_environment=False,
        local_policies=local_policies,
        global_policy=global_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    # Expand the single goal into the multi-goal specification.
    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(Pusher2D3DofMultiGoalEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        training_env=env,
        save_environment=False,
        qf=qf,
        policy=policy,
        # algo_interface='torch',
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train(online=True)

    return algorithm
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    # env = data['env']
    env = NormalizedBoxEnv(gym.make(args.env))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        if args.record:
            env.start_recording_video('prueba.mp4')
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        print('Accum reward is: ', path['rewards'].sum())
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.record:
            env.stop_recording_video()
            break
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = Reinforce(
        env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        i_qf2 = data['qf2']
        u_qf = data['u_qf']
        u_qf2 = data['u_qf2']
        i_vf = data['i_vf']
        u_vf = data['u_vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        if USE_Q2:
            u_qf2 = NNMultiQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                n_qs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                shared_hidden_sizes=[net_size],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_qf2 = None
            i_qf2 = None

        if EXPLICIT_VF:
            u_vf = NNMultiVFunction(
                obs_dim=obs_dim,
                n_vs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                shared_hidden_sizes=[net_size],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_vf = None
            i_vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            shared_hidden_sizes=[net_size],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )
        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUSAC(
        env=env,
        policy=policy,
        u_qf1=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf1=i_qf,
        u_qf2=u_qf2,
        i_qf2=i_qf2,
        u_vf=u_vf,
        i_vf=i_vf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    # algorithm.pretrain(10000)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
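# Why a multi-goal buffer: HIU-style algorithms store, for every transition,
# the intentional scalar reward plus a vector with one reward per
# unintentional sub-task (`reward_vector_size`). A self-contained numpy
# sketch of that storage layout; the real MultiGoalReplayBuffer API may
# differ from this illustration.
import numpy as np

max_size, n_sub_tasks = 4, 2
rewards = np.zeros((max_size, 1))                   # intentional reward
reward_vector = np.zeros((max_size, n_sub_tasks))   # per-sub-task rewards

# Storing one transition's rewards at slot 0:
rewards[0] = 1.5
reward_vector[0] = [0.3, -0.2]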
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            shared_hidden_sizes=[net_size],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )
        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(
        env=env,
        policy=policy,
        explo_policy=exploration_policy,
        u_qf=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf=i_qf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = TanhMlpPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    render_q = variant['render_q']
    save_q_path = '/home/desteban/logs/goalcompo_q_plots'

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params'])
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )
        for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    goal_pos = variant['env_params']['goal_position']
    q_fcn_positions = [
        (goal_pos[0], 0.0),
        (0.0, 0.0),
        (0.0, goal_pos[1]),
    ]
    plotter = QFPolicyPlotter(
        i_qf=i_qf,
        i_policy=i_policy,
        u_qfs=u_qfs,
        u_policies=u_policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
import argparse

# np.set_printoptions(precision=3, suppress=True)

TEND = 4.0
SIM_TIMESTEP = 0.01
FRAME_SKIP = 1
TS = FRAME_SKIP * SIM_TIMESTEP
T = int(TEND / TS)

GPU = True
# GPU = False

SEED = 450

ptu.set_gpu_mode(GPU)
np.random.seed(SEED)
ptu.seed(SEED)

# (a runnable sketch of this noise-smoothing scheme follows below)
noise_hyperparams = dict(
    smooth_noise=True,  # Apply a Gaussian filter to the generated noise
    smooth_noise_var=2.0e+0,  # np.power(2*Ts, 2). Variance of the Gaussian
                              # filter; in the Kumar (2016) paper it is the
                              # std dev of 2*Ts
    smooth_noise_renormalize=True,  # Renormalize smoothed noise to variance=1
    noise_var_scale=1.e-5*np.array([1., 1., 1., 1., .1, 0.1, 0.1]),
    # Scale applied to N(0, 1) noise: noise * sqrt(noise_var_scale),
    # only if smooth_noise_renormalize
)

algo_params = dict(
    seed=SEED,
    nepochs=100,
    num_samples=3,
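# A self-contained sketch of the smoothed-noise scheme the hyperparameters
# above describe: draw white Gaussian noise, low-pass it with a Gaussian
# filter, renormalize to unit variance, then scale by sqrt(noise_var_scale).
# This is an illustration under assumptions, not the trainer's actual code.
import numpy as np
from scipy.ndimage import gaussian_filter1d

def smooth_noise(T, dU, var_scale, filter_var=2.0, renormalize=True):
    noise = np.random.randn(T, dU)               # N(0, 1) per step and dim
    # Gaussian low-pass filter along the time axis
    noise = gaussian_filter1d(noise, sigma=np.sqrt(filter_var), axis=0)
    if renormalize:
        noise /= np.std(noise, axis=0, keepdims=True)  # variance back to 1
    return noise * np.sqrt(var_scale)            # per-dimension scaling

# e.g.: smooth_noise(T, dU=7, var_scale=1.e-5*np.array([1, 1, 1, 1, .1, .1, .1]))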
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    # Expand the single goal into the multi-goal specification.
    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        # replay_buffer=replay_buffer,
        # batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment parameters from the experiment's variant.json
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None

    print(env_params)
    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Reacher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )
        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            qf2 = None

        if EXPLICIT_VF:
            vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['v_hidden_w_init'],
                output_w_init=variant['v_output_w_init'],
            )
        else:
            vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        qf2=qf2,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None
        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )

        # # Clamp model parameters
        # qf.clamp_all_params(min=-0.003, max=0.003)
        # vf.clamp_all_params(min=-0.003, max=0.003)
        # policy.clamp_all_params(min=-0.003, max=0.003)
        # if USE_Q2:
        #     qf2.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        qf2=qf2,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']
    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env_params['is_render'] = True
    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment parameters from the experiment's variant.json
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
def run_experiment_here(
        experiment_function,
        variant=None,
        exp_id=0,
        seed=0,
        use_gpu=True,
        # Logger params:
        exp_prefix="default",
        snapshot_mode='last',
        snapshot_gap=1,
        git_info=None,
        script_name=None,
        base_log_dir=None,
        log_dir=None,
):
    """
    Run an experiment locally without any serialization.

    :param experiment_function: Function. `variant` will be passed in as its
        only argument.
    :param variant: Dictionary passed in to `experiment_function`.
    :param exp_id: Experiment ID. Should be unique across all experiments.
        Note that one experiment may correspond to multiple seeds.
    :param seed: Seed used for this experiment.
    :param use_gpu: Run with GPU. True by default.
    :param exp_prefix: Experiment prefix for the save file.
    :param script_name: Name of the running script.
    :param log_dir: If set, set the log directory to this. Otherwise,
        the directory will be auto-generated based on the exp_prefix.
    :return: Return value of `experiment_function`.
    """
    if variant is None:
        variant = {}
    variant['exp_id'] = str(exp_id)

    if seed is None and 'seed' not in variant:
        seed = random.randint(0, 100000)
    variant['seed'] = str(seed)

    reset_execution_environment()

    actual_log_dir = setup_logger(
        exp_prefix=exp_prefix,
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        base_log_dir=base_log_dir,
        log_dir=log_dir,
        git_info=git_info,
        script_name=script_name,
    )

    set_seed(seed)
    set_gpu_mode(use_gpu)

    run_experiment_here_kwargs = dict(
        variant=variant,
        exp_id=exp_id,
        seed=seed,
        use_gpu=use_gpu,
        exp_prefix=exp_prefix,
        snapshot_mode=snapshot_mode,
        snapshot_gap=snapshot_gap,
        git_info=git_info,
        script_name=script_name,
        base_log_dir=base_log_dir,
    )
    save_experiment_data(
        dict(run_experiment_here_kwargs=run_experiment_here_kwargs),
        actual_log_dir,
    )
    return experiment_function(variant)
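# A minimal sketch of calling run_experiment_here with one of the
# experiment() functions defined above. The variant keys shown are
# assumptions for illustration; each launcher script defines its own.
if __name__ == "__main__":
    example_variant = dict(
        gpu=False,
        net_size=64,
        env_params=dict(),
        algo_params=dict(replay_buffer_size=int(1e6), batch_size=64),
    )
    run_experiment_here(
        experiment,
        variant=example_variant,
        seed=450,
        use_gpu=False,
        exp_prefix='example-run',
        snapshot_mode='last',
    )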