def __init__(self, index, variant, candidate_size=10):
    ptu.set_gpu_mode(True)
    torch.set_num_threads(1)

    # Clear command-line args inside this worker process so nothing
    # downstream re-parses the driver's CLI arguments.
    import sys
    sys.argv = ['']
    del sys

    env_max_action = variant['env_max_action']
    obs_dim = variant['obs_dim']
    action_dim = variant['action_dim']
    latent_dim = variant['latent_dim']
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1 if variant[
        'use_next_obs_in_context'] else obs_dim + action_dim + 1

    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    self.context_encoder = ProbabilisticContextEncoder(
        mlp_enconder, variant['latent_dim'])
    self.Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    self.vae_decoder = VaeDecoder(
        max_action=variant['env_max_action'],
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + latent_dim,
        output_size=action_dim,
    )
    self.perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=action_dim,
    )

    self.use_next_obs_in_context = variant['use_next_obs_in_context']
    self.env = env_producer(variant['domain'], variant['seed'])
    self.num_evals = variant['num_evals']
    self.max_path_length = variant['max_path_length']
    self.vae_latent_dim = vae_latent_dim
    self.candidate_size = variant['candidate_size']

    # Give every worker its own deterministic environment and RNG seed.
    self.env.seed(10 * variant['seed'] + 1234 + index)
    set_seed(10 * variant['seed'] + 1234 + index)
    self.env.action_space.np_random.seed(123 + index)
def __init__(self, index, variant, candidate_size=10):
    ptu.set_gpu_mode(True)
    torch.set_num_threads(1)
    import sys
    sys.argv = ['']
    del sys

    env_max_action = variant['env_max_action']
    obs_dim = variant['obs_dim']
    action_dim = variant['action_dim']
    latent_dim = variant['latent_dim']
    vae_latent_dim = 2 * action_dim

    self.f = MlpEncoder(
        g_hidden_sizes=variant['g_hidden_sizes'],
        g_input_sizes=obs_dim + action_dim + 1,
        g_latent_dim=variant['g_latent_dim'],
        h_hidden_sizes=variant['h_hidden_sizes'],
        latent_dim=latent_dim,
    )
    self.Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    self.vae_decoder = VaeDecoder(
        max_action=variant['env_max_action'],
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + latent_dim,
        output_size=action_dim,
    )
    self.perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=action_dim,
    )

    self.env = env_producer(variant['domain'], variant['seed'])
    self.num_evals = variant['algo_params']['num_evals']
    self.max_path_length = variant['max_path_length']
    self.vae_latent_dim = vae_latent_dim
    self.num_trans_context = variant['num_trans_context']
    self.candidate_size = variant['candidate_size']
    self.seed = variant['seed']
    self.index = index

    self.env.seed(10 * self.seed + 1234 + index)
    set_seed(10 * self.seed + 1234 + index)
def experiment(variant, bcq_buffers, prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])
    env.reset()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # instantiate networks
    network_ensemble = []
    for _ in range(variant['num_network_ensemble']):
        P = FlattenMlp(
            hidden_sizes=variant['P_hidden_sizes'],
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        network_ensemble.append(P)

    trainer = SuperQTrainer(
        env,
        network_ensemble=network_ensemble,
        train_goal=variant['train_goal'],
        std_threshold=variant['std_threshold'],
        domain=variant['domain'],
    )

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        train_buffer,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0
    algorithm.train(start_epoch)
def __init__(self, variant, goal, candidate_size=10):
    ptu.set_gpu_mode(True)
    torch.set_num_threads(1)
    import sys
    sys.argv = ['']
    del sys

    self.env = env_producer(variant['env_name'], seed=0, goal=goal)
    obs_dim = int(np.prod(self.env.observation_space.shape))
    action_dim = int(np.prod(self.env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    self.agent = PEARLAgent(latent_dim, context_encoder, policy,
                            **variant['algo_params'])

    self.num_evals = variant['num_evals']
    self.max_path_length = variant['max_path_length']
def __init__(self, index, variant, candidate_size=10):
    ptu.set_gpu_mode(True)
    torch.set_num_threads(1)
    import sys
    sys.argv = ['']
    del sys

    self.env = env_producer(variant['domain'], variant['seed'])
    state_dim = self.env.observation_space.low.size
    action_dim = self.env.action_space.low.size
    max_action = float(self.env.action_space.high[0])

    self.policy = BCQ(state_dim, action_dim, max_action,
                      **variant['policy_params'])

    self.num_evals = variant['num_evals']
    self.max_path_length = variant['max_path_length']
    self.seed = variant['seed']
    self.index = index

    self.env.seed(10 * self.seed + 1234 + index)
    set_seed(10 * self.seed + 1234 + index)
def __init__(
        self,
        domain_name,
        env_seed,
        policy_producer,
        max_num_epoch_paths_saved=None,
        render=False,
        render_kwargs=None,
):
    torch.set_num_threads(1)

    env = env_producer(domain_name, env_seed)
    self._policy_producer = policy_producer

    super().__init__(
        env,
        max_num_epoch_paths_saved=max_num_epoch_paths_saved,
        render=render,
        render_kwargs=render_kwargs,
    )
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Get producer function for policy and value functions
    M = variant['layer_size']
    q_producer = get_q_producer(
        obs_dim,
        action_dim,
        hidden_sizes=[1024, 1024, 1024, 1024, 1024, 1024, 1024])
    policy_producer = get_policy_producer(obs_dim,
                                          action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    if prev_exp_state is not None:
        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])
        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])
        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])
        trainer.restore_from_snapshot(prev_exp_state['trainer'])
        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0
    algorithm.train(start_epoch)
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)
    env_max_action = float(expl_env.action_space.high[0])
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Network module from tiMe
    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])
    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    # Load the params obtained by tiMe
    ss = load_gzip_pickle(variant['path_to_snapshot'])
    ss = ss['trainer']

    encoder_state_dict = OrderedDict()
    for key, value in ss['context_encoder_state_dict'].items():
        if 'mlp_encoder' in key:
            encoder_state_dict[key.replace('mlp_encoder.', '')] = value

    mlp_enconder.load_state_dict(encoder_state_dict)
    qf1.load_state_dict(ss['Qs_state_dict'])
    target_qf1.load_state_dict(ss['Qs_state_dict'])
    qf2.load_state_dict(ss['Qs_state_dict'])
    target_qf2.load_state_dict(ss['Qs_state_dict'])
    vae_decoder.load_state_dict(ss['vae_decoder_state_dict'])
    perturbation_generator.load_state_dict(ss['perturbation_generator_dict'])

    tiMe_path_collector = tiMeSampler(
        expl_env,
        context_encoder,
        qf1,
        vae_decoder,
        perturbation_generator,
        vae_latent_dim=vae_latent_dim,
        candidate_size=variant['candidate_size'],
    )
    tiMe_path_collector.to(ptu.device)

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        tiMe_data_collector=tiMe_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0
    algorithm.train(start_epoch)
# wd_goals = np.random.uniform(0, 1, size=(8,)) * np.pi * 2 / 3
# ood_goals = np.random.uniform(2 / 3, 1.0, size=(8,)) * np.pi

# idx_list = [0, 1, 4, 10, 12, 14, 17, 21, 26, 27]
# train_goals = train_goals[idx_list]

# filename = './goals/humanoid-openai-dir-normal-goals.pkl'
# with open(filename, 'wb') as f:
#     pickle.dump([idx_list, train_goals, wd_goals, ood_goals], f)
# print([idx_list, train_goals, wd_goals, ood_goals])

# ---------------------Walker-Param-Normal-------------------------
sample_env = env_producer('walker-param', 0)
train_goals = sample_env.sample_tasks(30,
                                      is_train=True,
                                      is_within_distribution=True)
wd_goals = sample_env.sample_tasks(8,
                                   is_train=False,
                                   is_within_distribution=True)
ood_goals = sample_env.sample_tasks(8,
                                    is_train=False,
                                    is_within_distribution=False)
idx_list = list(range(30))
train_goals = [train_goals[idx] for idx in idx_list]

filename = './goals/walker-param-normal-goals.pkl'
with open(filename, 'wb') as f:
    pickle.dump([idx_list, train_goals, wd_goals, ood_goals], f)
def experiment(variant):
    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    idx_list, train_goals, wd_goals, ood_goals = pickle.load(
        open(filename, 'rb'))
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}"
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)
    print("Buffer directory: " + buffer_dir)

    # Load buffer
    bcq_buffers = []
    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )
        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)

    assert len(bcq_buffers) == len(idx_list)
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    set_seed(variant['seed'])

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])

    algorithm = PEARLSoftActorCritic(env=env,
                                     train_goals=train_goals,
                                     wd_goals=wd_goals,
                                     ood_goals=ood_goals,
                                     replay_buffers=train_buffer,
                                     nets=[agent, qf1, qf2, vf],
                                     latent_dim=latent_dim,
                                     **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
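# A minimal sketch of the 'util_params' sub-dictionary consumed by the GPU and
# logging setup at the end of experiment() above. Only the key names
# ('use_gpu', 'gpu_id', 'debug', 'base_log_dir') are taken from the code; the
# values below are placeholder assumptions, not the repo's actual defaults.
example_util_params = {
    'use_gpu': True,
    'gpu_id': 0,
    'debug': False,
    'base_log_dir': './output',
}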
    return timestamp


std_threshold = 0.1
in_mdp_batch_size = 128

eval_statistics = OrderedDict()

logger.reset()
setup_logger(
    log_dir=osp.join('./tune_threshold_loggings', create_simple_exp_name()))

filename = f'./goals/ant-dir-normal-goals.pkl'
train_goals, wd_goals, ood_goals = pickle.load(open(filename, 'rb'))

env = env_producer('ant-dir', 0, train_goals[0])

for epoch in range(200):
    file_name = osp.join('./data_reward_predictions', f'params_{epoch}.pkl')
    params = pickle.load(open(file_name, "rb"))

    obs = params['obs']
    actions = params['actions']
    rewards = params['rewards']
    pred_rewards = params['pred_rewards']

    obs_other_tasks = [
        obs[in_mdp_batch_size * i:in_mdp_batch_size * (i + 1)]
        for i in range(1, 32)
    ]
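    # Illustrative continuation, not part of the original tuning script: the
    # ground-truth rewards can be split per task with the same slicing used
    # for 'obs' above. The 32-task layout and in_mdp_batch_size come from the
    # code; the assumption here is that 'rewards' shares the same leading
    # batch dimension as 'obs'.
    rewards_other_tasks = [
        rewards[in_mdp_batch_size * i:in_mdp_batch_size * (i + 1)]
        for i in range(1, 32)
    ]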
def experiment(variant,
               bcq_policies,
               bcq_buffers,
               ensemble_params_list,
               prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])
    env_max_action = float(env.action_space.high[0])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1 if variant[
        'use_next_obs_in_context'] else obs_dim + action_dim + 1

    variant['env_max_action'] = env_max_action
    variant['obs_dim'] = obs_dim
    variant['action_dim'] = action_dim
    variant['mlp_enconder_input_size'] = mlp_enconder_input_size

    # instantiate networks
    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])
    ensemble_predictor = EnsemblePredictor(ensemble_params_list)

    Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    trainer = SuperQTrainer(
        ensemble_predictor=ensemble_predictor,
        num_network_ensemble=variant['num_network_ensemble'],
        bcq_policies=bcq_policies,
        std_threshold=variant['std_threshold'],
        is_combine=variant['is_combine'],
        nets=[context_encoder, Qs, vae_decoder, perturbation_generator])

    path_collector = RemotePathCollector(variant)

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        path_collector,
        train_buffer,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    # Log the variant
    logger.log("Variant:")
    logger.log(json.dumps(dict_to_safe_json(variant), indent=2))

    algorithm.train(start_epoch)
# set up logger
variant['log_dir'] = get_log_dir(variant)
logger.reset()
setup_logger(log_dir=variant['log_dir'],
             snapshot_gap=100,
             snapshot_mode="gap")

logger.log(f'Seed: {seed}')
set_seed(seed)

logger.log(f'Using GPU: {True}')
set_gpu_mode(mode=True, gpu_id=0)

# Get the information of the environment
env = env_producer(domain, seed)
state_dim = env.observation_space.low.size
action_dim = env.action_space.low.size
max_action = float(env.action_space.high[0])

# Load buffer
bcq_buffers = []
buffer_loader_id_list = []
for i, idx in enumerate(idx_list):
    bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
    filename = os.path.join(buffer_dir, bname)
    rp_buffer = ReplayBuffer.remote(
        index=i,
        seed=seed,
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']

    expl_env = env_producer(domain, seed)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    # Override with the hard-coded dimensions used for these domains.
    obs_dim, action_dim = {
        'GridGoal1': (2, 2),
        'GridGoal2': (2, 2),
        'GridGoal3': (2, 2),
        'AntEscape': (29, 8),
        'AntJump': (29, 8),
        'AntNavigate': (29, 8),
        'HumanoidUp': (47, 17)
    }[domain]

    # Get producer function for policy and value functions
    M = variant['layer_size']
    q_producer = get_q_producer(obs_dim, action_dim, hidden_sizes=[M, M])
    policy_producer = get_policy_producer(obs_dim, action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_dim=obs_dim,
                                 ac_dim=action_dim)
    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        log_dir=variant['log_dir'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    if prev_exp_state is not None:
        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])
        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])
        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])
        trainer.restore_from_snapshot(prev_exp_state['trainer'])
        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0
    algorithm.train(start_epoch)
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0
    algorithm.train(start_epoch)
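# Hypothetical usage sketch for the experiment() above. Only the key names are
# taken from the code; every value is a placeholder assumption, and the
# *_kwargs dicts would need to be filled in from the repo's actual configs
# before the commented-out call would run.
example_variant = {
    'domain': 'ant-dir',
    'seed': 0,
    'goal': 0.0,
    'Qs_hidden_sizes': [256, 256],
    'policy_hidden_sizes': [256, 256],
    'replay_buffer_size': int(1e6),
    'trainer_kwargs': {},
    'optimistic_exp': {},
    'algorithm_kwargs': {},
}
# experiment(example_variant)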