def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        qf.cuda()

    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Pusher2D3DofGoalCompoEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )
        for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=True)

    return algorithm
def main():
    # Environment Fcn
    env_fn = lambda: \
        NormalizedBoxEnv(
            CentauroTrayEnv(**env_params),
            # normalize_obs=True,
            normalize_obs=False,
            online_normalization=False,
            obs_mean=None,
            obs_var=None,
            obs_alpha=0.001,
        )

    # Logger kwargs
    logger_kwargs = setup_logger_kwargs(EXP_NAME, SEED)

    with tf.Graph().as_default():
        sac(
            env_fn,
            actor_critic=mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=(128, 128, 128)),
            seed=SEED,
            steps_per_epoch=PATHS_PER_EPOCH * PATH_LENGTH,
            epochs=EPOCHS,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,  # Polyak averaging coefficient for the target networks (0-1)
            lr=1e-3,
            alpha=0.2,  # Entropy regularization coefficient (inverse of reward scale)
            batch_size=BATCH_SIZE,
            start_steps=10000,
            max_ep_len=PATH_LENGTH,  # Max length of a trajectory
            logger_kwargs=logger_kwargs,
            save_freq=1,
        )
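# The constants referenced by main() above (EXP_NAME, SEED, PATHS_PER_EPOCH,
# PATH_LENGTH, EPOCHS, BATCH_SIZE, env_params) are assumed to be defined at
# module level in the original script. A minimal sketch with placeholder
# values; the concrete numbers here are illustrative assumptions, not taken
# from the original.
EXP_NAME = 'centauro_tray_sac'
SEED = 0
PATHS_PER_EPOCH = 5
PATH_LENGTH = 500
EPOCHS = 100
BATCH_SIZE = 256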
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]
    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['epoch_plotter'] = plotter
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=[net_size, net_size])
    policy = TanhMlpPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = DDPG(
        explo_env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Reacher2D3DofObstacleEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        training_env=env,
        save_environment=False,
        qf=qf,
        policy=policy,
        # algo_interface='torch',
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=True)

    return algorithm
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    # env = data['env']
    env = NormalizedBoxEnv(gym.make(args.env))
    print("Environment loaded!!")

    # # Load environment
    # with open('variant.json') as json_data:
    #     env_params = json.load(json_data)['env_params']
    # env_params.pop('goal')
    # env_params['is_render'] = True
    # env = NormalizedBoxEnv(args.env(**env_params))
    # print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    # else:
    #     set_gpu_mode(False)
    #     policy.cpu()

    if isinstance(policy, PyTorchModule):
        policy.train(False)

    while True:
        if args.record:
            env.start_recording_video('test.mp4')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        print('Accumulated reward:', path['rewards'].sum())

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            env.stop_recording_video()
            break
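# Hypothetical CLI wiring for simulate_policy() above; the argument names
# follow the attributes the function accesses (file, env, deterministic,
# gpu, H, record), but the defaults and help strings are assumptions.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='Path to the params.pkl snapshot')
    parser.add_argument('--env', type=str, default='Pusher-v2',
                        help='Gym environment id passed to gym.make()')
    parser.add_argument('--H', type=int, default=500,
                        help='Max path length per rollout')
    parser.add_argument('--deterministic', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()
    simulate_policy(args)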
def load_env(render=True):
    env_params = dict(
        is_render=render,
        # obs_distances=False,
        obs_distances=True,
        obs_with_img=False,
        # obs_with_ori=True,
        active_joints='RA',
        control_mode='joint_tasktorque',
        # control_mode='torque',
        balance_cost_weight=1.0,
        fall_cost_weight=1.0,
        tgt_cost_weight=3.0,
        # tgt_cost_weight=50.0,
        balance_done_cost=0.,  # 2.0*PATH_LENGTH,  # TODO: don't forget same balance weight
        tgt_done_reward=0.,  # 20.0,
        ctrl_cost_weight=1.0e-1,
        use_log_distances=True,
        log_alpha_pos=1e-4,
        log_alpha_ori=1e-4,
        goal_tolerance=0.05,
        min_obj_height=0.60,
        max_obj_height=1.20,
        max_obj_distance=0.20,
        max_time=None,
        sim_timestep=SIM_TIMESTEP,
        frame_skip=FRAME_SKIP,
        subtask=SUBTASK,
        random_init=True,
        seed=SEED,
    )

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    return env
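# A minimal smoke test for load_env(); it assumes the module-level constants
# used inside (SIM_TIMESTEP, FRAME_SKIP, SUBTASK, SEED) are already defined
# elsewhere in the script.
if __name__ == "__main__":
    env = load_env(render=True)
    obs = env.reset()
    for _ in range(10):
        # Step the environment with random actions as a sanity check
        action = env.action_space.sample()
        obs, reward, done, env_info = env.step(action)
        if done:
            obs = env.reset()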
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(Pusher2D3DofMultiGoalEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # _i_policy = GaussianPolicy(
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    if ptu.gpu_enabled():
        policy.cuda()

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = Reinforce(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=False)

    return algorithm
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]
    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(PusherEnv(goal=variant['env_params'].get('goal')))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = Reinforce(
        env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            qf2 = None

        if EXPLICIT_VF:
            vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['v_hidden_w_init'],
                output_w_init=variant['v_output_w_init'],
            )
        else:
            vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        qf2=qf2,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Reacher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        i_qf2 = data['qf2']
        u_qf = data['u_qf']
        u_qf2 = data['u_qf2']
        i_vf = data['i_vf']
        u_vf = data['u_vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        if USE_Q2:
            u_qf2 = NNMultiQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                n_qs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_qf2 = None
            i_qf2 = None

        if EXPLICIT_VF:
            u_vf = NNMultiVFunction(
                obs_dim=obs_dim,
                n_vs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_vf = None
            i_vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUSAC(
        env=env,
        policy=policy,
        u_qf1=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf1=i_qf,
        u_qf2=u_qf2,
        i_qf2=i_qf2,
        u_vf=u_vf,
        i_vf=i_vf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    # algorithm.pretrain(10000)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(
        env=env,
        policy=policy,
        explo_policy=exploration_policy,
        u_qf=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf=i_qf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = TanhMlpPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    # Clamp model parameters
    qf.clamp_all_params(min=-0.003, max=0.003)
    policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def plot_q_fcn(i_qf, i_qf2, u_qf, u_qf2, obs, policy):
    # Load environment (args is the module-level parsed CLI arguments)
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    # env.reset()
    # env.render()

    obs = np.array(obs)
    n_action_samples = 100

    x_min, y_min = env.action_space.low
    x_max, y_max = env.action_space.high
    delta = 0.05
    # xlim = (1.1*x_min, 1.1*x_max)
    # ylim = (1.1*y_min, 1.1*y_max)
    xlim = (1.0*x_min, 1.0*x_max)
    ylim = (1.0*y_min, 1.0*y_max)

    all_x = np.arange(x_min, x_max, delta)
    all_y = np.arange(y_min, y_max, delta)
    xy_mesh = np.meshgrid(all_x, all_y)

    all_acts = np.zeros((len(all_x)*len(all_y), 2))
    all_acts[:, 0] = xy_mesh[0].ravel()
    all_acts[:, 1] = xy_mesh[1].ravel()

    n_unintentions = u_qf.n_heads if u_qf is not None else 0

    def plot_q_contours(ax, values):
        values = values.reshape(len(all_x), len(all_y))
        contours = ax.contour(xy_mesh[0], xy_mesh[1], values, 20,
                              colors='dimgray')
        ax.clabel(contours, inline=1, fontsize=10, fmt='%.0f')
        ax.imshow(values, extent=(x_min, x_max, y_min, y_max),
                  origin='lower', alpha=0.5)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Vel. X', fontweight='bold', fontsize=18)
        ax.set_ylabel('Vel. Y', fontweight='bold', fontsize=18)
        ax.axis('equal')
        ax.set_aspect('equal', 'box')
        ax.grid(False)

    def plot_action_samples(ax, actions):
        x, y = actions[:, 0], actions[:, 1]
        ax.scatter(x, y, c='b', marker='*', zorder=5)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

    for ob in obs:
        all_obs = np.broadcast_to(ob, (all_acts.shape[0], 2))

        fig, all_axs = \
            subplots(1, n_unintentions + 1,
                     gridspec_kw={'wspace': 0, 'hspace': 0},
                     )
        # fig.suptitle('Q-val Observation: ' + str(ob))
        fig.tight_layout()
        fig.canvas.set_window_title('q_vals_%1d_%1d' % (ob[0], ob[1]))
        all_axs = np.atleast_1d(all_axs)

        # Compute and plot Main Task Q-values
        all_axs[0].set_title('Main Task',
                             fontdict={'fontsize': 30,
                                       'fontweight': 'medium'})
        q_vals = i_qf.get_values(all_obs, all_acts)[0]
        if i_qf2 is not None:
            q2_vals = i_qf2.get_values(all_obs, all_acts)[0]
            q_vals = np.concatenate([q_vals, q2_vals], axis=1)
            q_vals = np.min(q_vals, axis=1, keepdims=True)
        plot_q_contours(all_axs[0], q_vals)

        if u_qf is None:
            pol_kwargs = dict()
        else:
            pol_kwargs = dict(pol_idx=None)

        # Sample and plot Main Task policy actions
        action_samples = policy.get_actions(all_obs[:n_action_samples, :],
                                            deterministic=False,
                                            **pol_kwargs
                                            )[0]
        plot_action_samples(all_axs[0], action_samples)
        all_axs[0].set_xticklabels([])
        all_axs[0].set_yticklabels([])

        for aa in range(n_unintentions):
            # Compute and plot Sub-Task Q-values
            subgo_ax = all_axs[aa + 1]
            subgo_ax.set_title('Sub-Task %02d' % (aa+1),
                               fontdict={'fontsize': 30,
                                         'fontweight': 'medium'}
                               )
            q_vals = u_qf.get_values(all_obs, all_acts, val_idxs=[aa])[0]
            q_vals = q_vals[0]
            if u_qf2 is not None:
                # val_idxs=[aa] added here to query the same head as u_qf;
                # the original call omitted it, which looks like a bug
                q2_vals = u_qf2.get_values(all_obs, all_acts,
                                           val_idxs=[aa])[0]
                q2_vals = q2_vals[0]
                q_vals = np.concatenate([q_vals, q2_vals], axis=1)
                q_vals = np.min(q_vals, axis=1, keepdims=True)
            plot_q_contours(subgo_ax, q_vals)

            if u_qf is None:
                pol_kwargs = dict()
            else:
                pol_kwargs = dict(pol_idx=aa)

            # Sample and plot Sub-Task policy actions
            action_samples = policy.get_actions(all_obs[:n_action_samples, :],
                                                deterministic=False,
                                                **pol_kwargs
                                                )[0]
            plot_action_samples(subgo_ax, action_samples)
            subgo_ax.get_yaxis().set_visible(False)
            subgo_ax.set_xticklabels([])
def experiment(variant):
    render_q = variant['render_q']
    save_q_path = '/home/desteban/logs/goalcompo_q_plots'

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params'])
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [NNQFunction(obs_dim=obs_dim,
                         action_dim=action_dim,
                         hidden_sizes=(net_size, net_size))
             for _ in range(n_unintentional)]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    ) for _ in range(n_unintentional)]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    goal_pos = variant['env_params']['goal_position']
    q_fcn_positions = [
        (goal_pos[0], 0.0),
        (0.0, 0.0),
        (0.0, goal_pos[1]),
    ]
    plotter = QFPolicyPlotter(
        i_qf=i_qf,
        i_policy=i_policy,
        u_qfs=u_qfs,
        u_policies=u_policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        # replay_buffer=replay_buffer,
        # batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']
    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env_params['is_render'] = True
    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 5), (-5, 5)]
    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))

    # _i_policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )

    # Debug: check which parameters live on the GPU before moving the models
    for net in algorithm.torch_models:
        print(net)
        for pp in net.parameters():
            print(pp.is_cuda)
        print('-----------')
    input('Press Enter to move the models to GPU (if enabled)...')

    if ptu.gpu_enabled():
        algorithm.cuda()

    # Debug: verify the parameters are on the GPU after the move
    for net in algorithm.torch_models:
        print(net)
        for pp in net.parameters():
            print(pp.is_cuda)
    input('Press Enter to start training...')

    algorithm.train()

    return algorithm
env_params = dict(
    actuation_cost_coeff=0.5,
    distance_cost_coeff=1.5,
    log_distance_cost_coeff=0,  # 1.5,
    alpha=1e-6,
    # Initial Condition
    init_position=(-4., -4.),
    init_sigma=1.50,
    # Goal
    goal_position=(5., 5.),
    goal_threshold=0.25,
    # Others
    dynamics_sigma=0.1,
    # horizon=PATH_LENGTH,
    horizon=None,
)

env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))

for ii in range(5):
    env.reset()
    env.render()
env.reset()
env.render()

# input('Press a key to start interacting...')
for ii in range(50):
    action = env.action_space.sample()
    obs, reward, done, env_info = env.step(action)
    print('')
    print('---' * 3, ii, '---' * 3)
    print('action -->', action)
    print('obs -->', obs)
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None
        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )

        # # Clamp model parameters
        # qf.clamp_all_params(min=-0.003, max=0.003)
        # vf.clamp_all_params(min=-0.003, max=0.003)
        # policy.clamp_all_params(min=-0.003, max=0.003)
        # if USE_Q2:
        #     qf2.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        qf2=qf2,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    exploration_pol_id = 1
    render_q = True
    variant['algo_params']['exploration_pol_id'] = exploration_pol_id
    save_q_path = '/home/desteban/logs/two_q_plots%d' % exploration_pol_id
    goal_positions = [(5, 5), (-5, 5)]
    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # qfs = [FlattenMlp(
    #     hidden_sizes=[net_size, net_size],
    #     input_size=obs_dim + action_dim,
    #     output_size=1) for _ in range(n_demons)]
    qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for qf in qfs:
            qf.cuda()

    policies = [
        StochasticPolicy(hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim)
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for policy in policies:
            policy.cuda()

    replay_buffer = MultiEnvReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        reward_vector_size=n_demons,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qfs,
        policy=policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        u_qfs=qfs,
        u_policies=policies,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
def simulate_policy(args):
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional '
                  'policy %02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']
    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # Example normalization statistics from a previous run:
        # obs_mean = np.array([ 0.07010766,  0.37585765,  0.21402615,  0.24426296,  0.5789634 ,
        #                       0.88510203,  1.6878743 ,  0.02656335,  0.03794186, -1.0241051 ,
        #                      -0.5226027 ,  0.6198239 ,  0.49062446,  0.01197532,  0.7888951 ,
        #                      -0.4857273 ,  0.69160587, -0.00617676,  0.08966777, -0.14694819,
        #                       0.9559917 ,  1.0450271 , -0.40958315,  0.86435956,  0.00609685,
        #                      -0.01115279, -0.21607827,  0.9762933 ,  0.80748135, -0.48661205,
        #                       0.7473679 ,  0.01649722,  0.15451911, -0.17285274,  0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407   , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269  , 0.84190637, 1.5407104 , 0.1       ,
        #                     0.10330457, 0.1       , 0.1       , 0.1       , 0.1528581 ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.12320185,
        #                     0.1       , 0.18369523, 0.200373  , 0.11895574, 0.15118493])

    print(env_params)
    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

        if args.record:
            break
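# plot_rollout_reward() is called above but defined elsewhere in the original
# script. A minimal sketch of what such a helper could look like, offered as
# an assumption rather than the original implementation:
import matplotlib.pyplot as plt

def plot_rollout_reward(path):
    # Plot the accumulated reward of a single rollout over time.
    plt.figure()
    plt.plot(np.cumsum(path['rewards']))
    plt.xlabel('Time step')
    plt.ylabel('Accumulated reward')
    plt.show()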