# Experiment: Soft Q-learning (SQL) on a gym environment.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        qf.cuda()

    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

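# --- Usage sketch (illustrative, not part of the original scripts) ---
# A minimal `variant` dict with the keys that experiment() above reads.
# The env id and hyperparameter values are placeholders; the exact
# contents of 'algo_params' depend on the SQL implementation.
example_variant = dict(
    gpu=False,
    env_name='Pendulum-v0',  # hypothetical gym id
    net_size=128,
    algo_params=dict(),      # SQL hyperparameters go here
)
# algorithm = experiment(example_variant)
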
# Experiment: SQL on MultiCompositionEnv with Q-function plotting.
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]
    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['epoch_plotter'] = plotter
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: SQL on Pusher2D3DofMultiGoalEnv with goal decomposition.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(Pusher2D3DofMultiGoalEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        training_env=env,
        save_environment=False,
        qf=qf,
        policy=policy,
        # algo_interface='torch',
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=True)

    return algorithm

# Experiment: SAC on a gym environment.
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

# Experiment: IU-SQL (intentional/unintentional) on Pusher2D3DofGoalCompoEnv.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Pusher2D3DofGoalCompoEnv(**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )
        for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional)
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      training_env=env,
                      save_environment=False,
                      u_qfs=u_qfs,
                      u_policies=u_policies,
                      i_policy=i_policy,
                      i_qf=i_qf,
                      algo_interface='torch',
                      min_buffer_size=variant['algo_params']['batch_size'],
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train(online=True)

    return algorithm

# Experiment: DDPG with OU exploration noise on a gym environment.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=[net_size, net_size])
    policy = TanhMlpPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = DDPG(
        explo_env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: SQL on PusherEnv (plotting currently disabled).
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]
    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(PusherEnv(goal=variant['env_params'].get('goal')))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: MDGPS on Reacher2D3DofBulletEnv with fixed initial conditions.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    # env = NormalizedBoxEnv(
    #     Reacher2D3DofBulletEnv(**variant['env_params'])
    # )
    env = Reacher2D3DofBulletEnv(**variant['env_params'])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # First three values: joint angles (deg); last three: target state.
    initial_conds = [
        [10, 5, 20, 0.2, 0.5, 0],
        [10, 5, 20, 0.1, 0.1, 0],
        [10, 5, 20, 0.15, 0.8, 0],
    ]
    for init_cond in initial_conds:
        env.add_initial_condition(robot_config=np.deg2rad(init_cond[:3]),
                                  tgt_state=init_cond[-3:])

    net_size = variant['net_size']
    # global_policy = TanhGaussianPolicy(
    global_policy = MlpPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    local_policies = [
        LinearGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            T=PATH_LENGTH,
        )
        for _ in range(N_LOCAL_POLS)
    ]

    # replay_buffer = FakeReplayBuffer()
    # variant['algo_params']['replay_buffer'] = replay_buffer

    # # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = MDGPS(env=env,
                      eval_env=env,
                      save_environment=False,
                      local_policies=local_policies,
                      global_policy=global_policy,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: SoftActorCritic on a gym environment.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Policy forward pass: concatenate a latent noise vector to the
# observation and map the result through the MLP.
def forward(
        self,
        obs,
        deterministic=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample
    """
    # TODO: Define proper deterministic behavior for a sampling policy;
    # for now a zero latent stands in for "no noise".
    latent_shape = (*list(obs.shape)[:-1], self._action_dim)
    if deterministic:
        latent = torch.zeros(latent_shape)
    else:
        latent = self._latent_dist.sample(latent_shape).squeeze(-1)

    if ptu.gpu_enabled():
        latent = latent.cuda()

    h = torch.cat([obs, latent], dim=-1)

    for i, fc in enumerate(self.fcs):
        h = fc(h)
        # Layer norm on every hidden layer except the last one.
        if self.layer_norm and i < len(self.fcs) - 1:
            h = self.layer_norms[i](h)
        h = self.hidden_activation(h)

    action = self.last_fc(h)
    if self._squash:
        action = torch.tanh(action)

    info_dict = dict()

    return action, info_dict

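# --- Usage sketch (illustrative) ---
# How forward() above is typically invoked, assuming `policy` is an
# instance of the class this method belongs to and `obs` is a float
# tensor of shape (batch_size, obs_dim):
#
#   obs = torch.randn(32, 6)
#   action, info = policy(obs)                      # stochastic: latent sampled
#   action, info = policy(obs, deterministic=True)  # zero latent, no sampling
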
# Experiment: REINFORCE on a gym environment.
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = Reinforce(
        env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: HIU-SAC on CentauroTrayEnv, optionally resuming from a checkpoint.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        i_qf2 = data['qf2']
        u_qf = data['u_qf']
        u_qf2 = data['u_qf2']
        i_vf = data['i_vf']
        u_vf = data['u_vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        if USE_Q2:
            u_qf2 = NNMultiQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                n_qs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_qf2 = None
            i_qf2 = None

        if EXPLICIT_VF:
            u_vf = NNMultiVFunction(
                obs_dim=obs_dim,
                n_vs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_vf = None
            i_vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUSAC(
        env=env,
        policy=policy,
        u_qf1=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf1=i_qf,
        u_qf2=u_qf2,
        i_qf2=i_qf2,
        u_vf=u_vf,
        i_vf=i_vf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    # algorithm.pretrain(10000)
    algorithm.train(start_epoch=start_epoch)

    return algorithm

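# --- Configuration sketch (illustrative) ---
# Keys that the HIU-SAC experiment() above reads from `variant`; every
# value below is a placeholder, and the activation / weight-init entries
# are passed straight to the network constructors, whose accepted values
# are not shown in these scripts.
hiu_sac_variant = dict(
    seed=0,
    gpu=False,
    env_params=dict(),         # forwarded to CentauroTrayEnv
    load_dir=None,             # set to resume from <log_dir>/params.pkl
    log_dir=None,
    net_size=128,
    hidden_activation=None,    # placeholder
    q_hidden_w_init=None,      # placeholder
    q_output_w_init=None,      # placeholder
    pol_hidden_w_init=None,    # placeholder
    pol_output_w_init=None,    # placeholder
    input_norm=False,
    shared_layer_norm=False,
    policies_layer_norm=False,
    mixture_layer_norm=False,
    softmax_weights=False,
    replay_buffer_size=int(1e6),
    algo_params=dict(),        # HIUSAC hyperparameters
)
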
# Experiment: HIU-DDPG on Pusher2D3DofGoalCompoEnv.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(
        env=env,
        policy=policy,
        explo_policy=exploration_policy,
        u_qf=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf=i_qf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Experiment: DDPG on CentauroTrayEnv with parameter clamping.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)
    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size]
        )
        policy = TanhMlpPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Experiment: IU-SQL on MultiCompositionEnv with per-goal Q-functions and plotting.
def experiment(variant):
    exploration_pol_id = 1
    render_q = True
    variant['algo_params']['exploration_pol_id'] = exploration_pol_id
    save_q_path = '/home/desteban/logs/two_q_plots%d' % exploration_pol_id
    goal_positions = [(5, 5), (-5, 5)]
    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # qfs = [FlattenMlp(
    #     hidden_sizes=[net_size, net_size],
    #     input_size=obs_dim + action_dim,
    #     output_size=1) for _ in range(n_demons)]
    qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for qf in qfs:
            qf.cuda()

    policies = [
        StochasticPolicy(hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim)
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for policy in policies:
            policy.cuda()

    replay_buffer = MultiEnvReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        reward_vector_size=n_demons,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qfs,
        policy=policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      u_qfs=qfs,
                      u_policies=policies,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: IU-SQL on Navigation2dGoalCompoEnv with Q-function plotting.
def experiment(variant):
    render_q = variant['render_q']
    save_q_path = '/home/desteban/logs/goalcompo_q_plots'

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params'])
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        )
        for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    goal_pos = variant['env_params']['goal_position']
    q_fcn_positions = [
        (goal_pos[0], 0.0),
        (0.0, 0.0),
        (0.0, goal_pos[1])
    ]
    plotter = QFPolicyPlotter(
        i_qf=i_qf,
        i_policy=i_policy,
        u_qfs=u_qfs,
        u_policies=u_policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: PPO on Pusher2D3DofGoalCompoEnv.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)
    np.random.seed(SEED)
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size]
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        # replay_buffer=replay_buffer,
        # batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Script entry: build the environment, cost, and policies, then run MDGPS.
env = create_environment()

# for cc in range(env.n_init_conds):
#     env.reset(condition=cc)
#     print(cc)
#     input('wuuu')

cost_fcn = create_cost_fcn(env)

local_policies, global_policy = create_policies(env)

mdgps_algo = create_algo(env, local_policies, global_policy, cost_fcn)

# if ptu.gpu_enabled():
#     mdgps_algo.cuda()
if ptu.gpu_enabled():
    global_policy.cuda()

start_epoch = 0
mdgps_algo.train(start_epoch=start_epoch)

# action_dim = env.action_dim
# obs_dim = env.obs_dim
# state_dim = env.state_dim
#
# print(action_dim, obs_dim, state_dim)
#
# fake_sample = dict(
#     actions=np.random.rand(10, action_dim),
#     observations=np.random.rand(10, obs_dim)
# )

# Experiment: DDPG on Reacher2D3DofGoalCompoEnv.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Reacher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Experiment: IU-SQL on Reacher2D3DofObstacleEnv.
def experiment(variant):
    exploration_pol_id = 1
    variant['algo_params']['exploration_pol_id'] = exploration_pol_id
    n_demons = 2

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        Reacher2D3DofObstacleEnv(is_render=False,
                                 obs_with_img=False,
                                 rdn_tgt_pose=True,
                                 sim_timestep=0.001,
                                 frame_skip=10,
                                 obs_distances=True,
                                 tgt_cost_weight=1.0,
                                 obst_cost_weight=3.0,
                                 ctrl_cost_weight=1.0e-2,
                                 safe_radius=0.15,
                                 inside_cost=1,
                                 outside_cost=0))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # qfs = [FlattenMlp(
    #     hidden_sizes=[net_size, net_size],
    #     input_size=obs_dim + action_dim,
    #     output_size=1) for _ in range(n_demons)]
    qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for qf in qfs:
            qf.cuda()

    policies = [
        StochasticPolicy(hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim)
        for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for policy in policies:
            policy.cuda()

    replay_buffer = MultiEnvReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        reward_vector_size=n_demons,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      u_qfs=qfs,
                      u_policies=policies,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm

# Experiment: SAC on Pusher2D3DofGoalCompoEnv, optionally resuming from a checkpoint.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None
        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )

        # # Clamp model parameters
        # qf.clamp_all_params(min=-0.003, max=0.003)
        # vf.clamp_all_params(min=-0.003, max=0.003)
        # policy.clamp_all_params(min=-0.003, max=0.003)
        # if USE_Q2:
        #     qf2.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        qf2=qf2,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Experiment: SAC on Navigation2dGoalCompoEnv, optionally resuming from a checkpoint.
def experiment(variant):
    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            qf2 = None

        if EXPLICIT_VF:
            vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['v_hidden_w_init'],
                output_w_init=variant['v_output_w_init'],
            )
        else:
            vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(
        explo_env=env,
        policy=policy,
        qf=qf,
        qf2=qf2,
        vf=vf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm

# Experiment: SQL on MultiCompositionEnv with Q-function plotting.
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 5), (-5, 5)]
    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]
    n_demons = len(goal_positions)

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))

    # policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        qf=qf,
        policy=policy,
        **variant['algo_params'])

    # Debug helper: print which model parameters live on the GPU
    # (blocking input() calls removed so training is not interrupted).
    # for net in algorithm.torch_models:
    #     print(net)
    #     for pp in net.parameters():
    #         print(pp.is_cuda)
    #     print('-----------')

    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
