def experiment(variant):
    """Build an IUSQL learner for the normalized pusher env and train it.

    Args:
        variant: Experiment configuration mapping. The keys read here are
            ``'gpu'``, ``'env_params'``, ``'net_size'`` and
            ``'algo_params'`` (which must contain ``'replay_buffer_size'``
            and ``'batch_size'``). ``variant['algo_params']`` is modified in
            place: ``'replay_buffer'`` and ``'_epoch_plotter'`` are added
            before the mapping is expanded into the ``IUSQL`` constructor.

    Returns:
        The ``IUSQL`` instance, after ``train(online=True)`` has returned.
    """
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Pusher2D3DofGoalCompoEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2
    net_size = variant['net_size']

    def build_policy():
        # Every policy (unintentional and intentional) shares this layout.
        return StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    # One Q-function per unintentional task; the intentional Q-function is
    # built on top of that list.
    u_qfs = []
    for _ in range(n_unintentional):
        u_qfs.append(
            NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=(net_size, net_size),
            )
        )
    i_qf = SumNNQFunction(
        obs_dim=obs_dim,
        action_dim=action_dim,
        q_functions=u_qfs,
    )

    u_policies = [build_policy() for _ in range(n_unintentional)]
    i_policy = build_policy()

    # Same dict object as variant['algo_params'], so the additions below are
    # visible to the caller as well.
    algo_params = variant['algo_params']
    algo_params['replay_buffer'] = MultiGoalReplayBuffer(
        algo_params['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional,
    )

    # QF plot: no epoch plotter is attached.
    algo_params['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=algo_params['batch_size'],
        **algo_params
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=True)
    return algorithm
def simulate_policy(args):
    """Load a saved policy snapshot and roll it out in a rendered pusher env.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``file`` (snapshot path; ``variant.json`` is expected in the same
            directory), ``deterministic``, ``un`` (unintentional policy
            index, ``-1`` meaning the intentional policy), ``subtask``,
            ``gpu``, ``record`` and ``H`` (path-length override, ``-1`` to
            use the logged ``path_length``).

    The function keeps producing rollouts until interrupted, unless
    ``args.record`` is set, in which case it stops after the first one.
    """
    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    wants_unintentional = args.un > -1

    if args.deterministic and wants_unintentional:
        print('Using the deterministic version of the UNintentional policy '
              '%02d.' % args.un)
        if 'u_policy' in data:
            stochastic = MultiPolicySelector(data['u_policy'], args.un)
        elif isinstance(data['policy'], TanhGaussianPolicy):
            stochastic = data['policy']
        else:
            stochastic = WeightedMultiPolicySelector(data['policy'], args.un)
        policy = MakeDeterministic(stochastic)
    elif args.deterministic:
        print('Using the deterministic version of the Intentional policy.')
        policy = data['policy']
        if isinstance(policy, ExplorationPolicy):
            policy = MakeDeterministic(policy)
    elif wants_unintentional:
        print('Using the UNintentional stochastic policy %02d' % args.un)
        source_key = 'u_policy' if 'u_policy' in data else 'policy'
        policy = WeightedMultiPolicySelector(data[source_key], args.un)
    else:
        print('Using the Intentional stochastic policy.')
        policy = data['policy']

    print("Policy loaded!!")

    # Load environment: rebuild it from the variant stored next to the
    # snapshot file.
    variant_path = os.path.join(os.path.dirname(args.file), 'variant.json')
    with open(variant_path) as json_data:
        log_data = json.load(json_data)
    env_params = log_data['env_params']
    H = int(log_data['path_length'])

    env_params.pop('goal', None)
    env_params['is_render'] = True
    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    # Turn training mode off on the underlying torch module: the wrapped
    # policy when `policy` is a MakeDeterministic, otherwise `policy` itself.
    if isinstance(policy, MakeDeterministic):
        torch_candidate = policy.stochastic_policy
    else:
        torch_candidate = policy
    if isinstance(torch_candidate, PyTorchModule):
        torch_candidate.train(False)

    # None of the settings below change between rollouts, so they are
    # resolved once before the loop.
    if args.record:
        def rollout_start_fcn():
            return env.start_recording_video('pusher_video.mp4')

        def rollout_end_fcn():
            return env.stop_recording_video()
    else:
        rollout_start_fcn = None
        rollout_end_fcn = None

    obs_normalizer = data.get('obs_normalizer')

    if args.H != -1:
        H = args.H

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        # A recording run stops after a single rollout.
        if args.record:
            break
def experiment(variant):
    """Train HIUDDPG on the normalized ``Pusher2D3DofGoalCompoEnv``.

    Args:
        variant: Experiment configuration mapping. Keys always read:
            ``'seed'``, ``'gpu'``, ``'env_params'``, ``'load_dir'``,
            ``'replay_buffer_size'`` and ``'algo_params'``. When
            ``'load_dir'`` is falsy the networks are built from scratch and
            these keys are read as well: ``'net_size'``,
            ``'hidden_activation'``, ``'q_hidden_w_init'``,
            ``'q_output_w_init'``, ``'input_norm'``, ``'shared_layer_norm'``,
            ``'policies_layer_norm'``, ``'mixture_layer_norm'``,
            ``'softmax_weights'``, ``'pol_hidden_w_init'`` and
            ``'pol_output_w_init'``.
            ``variant['env_params']['seed']`` is overwritten with
            ``variant['seed']``.

    Returns:
        The ``HIUDDPG`` instance, after ``train`` has returned.
    """
    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        # The snapshot is read from the directory that gates this branch
        # ('load_dir'), not from the output 'log_dir'.
        # NOTE: joblib.load unpickles arbitrary objects; only point
        # 'load_dir' at snapshots from a trusted source.
        params_file = os.path.join(variant['load_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            shared_hidden_sizes=[net_size],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            shared_hidden_sizes=[net_size],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        # Optional initialisation step, applied only to a freshly built
        # policy (module-level switch INIT_AVG_MIXING).
        if INIT_AVG_MIXING:
            set_average_mixing(
                policy, n_unintentional, obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(
        env=env,
        policy=policy,
        explo_policy=exploration_policy,
        u_qf=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf=i_qf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    """Train PPO on the normalized ``Pusher2D3DofGoalCompoEnv``.

    Args:
        variant: Experiment configuration mapping. Keys read: ``'gpu'``,
            ``'env_params'`` (must contain a two-element ``'goal'``),
            ``'log_dir'``, ``'algo_params'`` and, when ``'log_dir'`` is
            falsy, ``'net_size'``. ``variant['env_params']`` is modified in
            place: ``'goal'`` is removed and replaced by ``'goal_poses'``.

    Returns:
        The ``PPO`` instance, after ``train`` has returned.

    Raises:
        KeyError: If ``variant['env_params']`` has no ``'goal'`` entry.
    """
    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    # Replace the single goal by three goal poses: the full goal plus one
    # pose per coordinate with the other coordinate set to 'any'.
    env_params = variant['env_params']
    goal = env_params.pop('goal')
    env_params['goal_poses'] = [goal, (goal[0], 'any'), ('any', goal[1])]

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        # NOTE: joblib.load unpickles arbitrary objects; only load snapshots
        # from a trusted source.
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters (fresh networks only, so that loaded
        # weights are left untouched).
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    # PPO is constructed without a replay buffer, so none is allocated here.
    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    """Train SAC on the normalized ``Pusher2D3DofGoalCompoEnv``.

    Args:
        variant: Experiment configuration mapping. Keys always read:
            ``'seed'``, ``'gpu'``, ``'env_params'``, ``'load_dir'``,
            ``'replay_buffer_size'``, ``'algo_params'`` and
            ``'steps_pretrain'``. When ``'load_dir'`` is falsy the networks
            are built from scratch and ``'net_size'`` and
            ``'hidden_activation'`` are read as well.
            ``variant['env_params']['seed']`` is overwritten with
            ``variant['seed']``.

    Returns:
        The ``SAC`` instance, after ``pretrain`` and ``train`` have returned.
    """
    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        # The snapshot is read from the directory that gates this branch
        # ('load_dir'), not from the output 'log_dir'.
        # NOTE: joblib.load unpickles arbitrary objects; only point
        # 'load_dir' at snapshots from a trusted source.
        params_file = os.path.join(variant['load_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']
        # Taken from the `variant` argument like every other setting, rather
        # than from a module-level dict.
        hidden_activation = variant['hidden_activation']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=hidden_activation,
            hidden_sizes=[net_size, net_size],
        )
        # The second Q-function is optional (module-level switch USE_Q2).
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=hidden_activation,
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None

        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=hidden_activation,
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=hidden_activation,
            hidden_sizes=[net_size, net_size],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        qf2=qf2,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
def experiment(variant):
    """Train DDPG on the normalized ``Pusher2D3DofGoalCompoEnv``.

    Args:
        variant: Experiment configuration mapping. Keys read: ``'gpu'``,
            ``'env_params'`` (must contain a two-element ``'goal'``),
            ``'log_dir'``, ``'net_size'``, ``'replay_buffer_size'`` and
            ``'algo_params'``. ``variant['env_params']`` is modified in
            place: ``'goal'`` is removed and replaced by ``'goal_poses'``.

    Returns:
        The ``DDPG`` instance, after ``train`` has returned.

    Raises:
        KeyError: If ``variant['env_params']`` has no ``'goal'`` entry.
        NotImplementedError: If ``variant['log_dir']`` is truthy; resuming
            from a snapshot is not implemented for this experiment.
    """
    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    # Replace the single goal by three goal poses: the full goal plus one
    # pose per coordinate with the other coordinate set to 'any'.
    env_params = variant['env_params']
    goal = env_params.pop('goal')
    env_params['goal_poses'] = [goal, (goal[0], 'any'), ('any', goal[1])]

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        # Fail before touching the snapshot: its contents would be discarded
        # anyway, and unpickling it is both wasted I/O and unsafe for
        # untrusted files.
        raise NotImplementedError(
            'Resuming from a saved snapshot is not implemented.'
        )

    start_epoch = 0
    net_size = variant['net_size']

    qf = NNQFunction(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    policy = TanhMlpPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    # Clamp model parameters
    qf.clamp_all_params(min=-0.003, max=0.003)
    policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(start_epoch=start_epoch)

    return algorithm