def experiment(variant):
    # from softlearning.environments.gym import register_image_reach
    # register_image_reach()
    # env = gym.envs.make(
    #     'Pusher2d-ImageReach-v0',
    # )
    from softlearning.environments.gym.mujoco.image_pusher_2d import (
        ImageForkReacher2dEnv)

    env_kwargs = {
        'image_shape': (32, 32, 3),
        'arm_goal_distance_cost_coeff': 0.0,
        'arm_object_distance_cost_coeff': 1.0,
        'goal': (0, -1),
    }

    eval_env = ImageForkReacher2dEnv(**env_kwargs)
    expl_env = ImageForkReacher2dEnv(**env_kwargs)

    input_width, input_height, input_channels = eval_env.image_shape
    image_dim = input_width * input_height * input_channels

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=input_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )
    non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim

    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf_obs_processor = nn.Sequential(
            Split(qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )
        qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        qf_kwargs['obs_processor'] = qf_obs_processor
        qf_kwargs['output_size'] = 1
        qf_kwargs['input_size'] = (
            action_dim + qf_cnn.conv_output_flat_size + non_image_dim
        )
        qf1 = MlpQfWithObsProcessor(**qf_kwargs)
        qf2 = MlpQfWithObsProcessor(**qf_kwargs)

        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(
            Split(target_qf_cnn, identity, image_dim),
            FlattenEach(),
            ConcatTuple(),
        )
        target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
        target_qf_kwargs['obs_processor'] = target_qf_obs_processor
        target_qf_kwargs['output_size'] = 1
        target_qf_kwargs['input_size'] = (
            action_dim + target_qf_cnn.conv_output_flat_size + non_image_dim
        )
        target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
        target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf1_cnn,
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        Split(policy_cnn, identity, image_dim),
        FlattenEach(),
        ConcatTuple(),
    )
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size + non_image_dim,
        action_dim,
        **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
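
# ---------------------------------------------------------------------------
# Illustrative sketch only: a minimal `variant` for the image-reacher SAC
# launcher above, containing just the keys that function reads. Every value
# here (and the exact CNN kwargs) is an assumption for illustration, not a
# setting taken from the original experiments; adapt it to the local CNN,
# SACTrainer, and algorithm signatures before use.
# ---------------------------------------------------------------------------
example_image_reacher_variant = dict(
    shared_qf_conv=True,
    collection_mode='batch',
    replay_buffer_size=int(1e6),
    cnn_params=dict(          # hypothetical CNN settings
        kernel_sizes=[3, 3],
        n_channels=[16, 16],
        strides=[1, 1],
        paddings=[0, 0],
        hidden_sizes=[],
    ),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    trainer_kwargs=dict(discount=0.99, soft_target_tau=5e-3),
    algo_kwargs=dict(
        num_epochs=100,
        batch_size=256,
        max_path_length=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)
# experiment(example_image_reacher_variant)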

def experiment(variant):
    env_params = ENV_PARAMS[variant['env']]
    env_mod_params = variant['env_mod']
    variant.update(env_params)

    expl_env = NormalizedBoxEnv(variant['env_class'](env_mod_params))
    eval_env = NormalizedBoxEnv(variant['env_class']({}))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithmModEnv(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
            mod_env_epoch_schedule=variant['mod_env_epoch_schedule'],
            env_mod_dist=variant['mod_env_dist'],
            env_class=variant['env_class'],
            env_mod_params=variant['env_mod'])
    algorithm.to(ptu.device)
    algorithm.train()
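
# ---------------------------------------------------------------------------
# Illustrative sketch only: the extra keys the env-modification SAC launcher
# above expects on top of the usual SAC settings. All values here are
# hypothetical; ENV_PARAMS[variant['env']] is assumed to supply env_class,
# max_path_length, num_epochs, and the other schedule keys via
# variant.update(env_params).
# ---------------------------------------------------------------------------
example_env_mod_variant = dict(
    env='placeholder-env',            # must be a key of ENV_PARAMS
    env_mod={'gravity_scale': 0.8},   # hypothetical modification parameters
    mod_env_epoch_schedule=1.0,
    mod_env_dist=None,
    layer_size=256,
    replay_buffer_size=int(1e6),
    collection_mode='batch',
    trainer_kwargs=dict(discount=0.99),
)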

def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    normalize_env = variant.get("normalize_env", True)
    env_id = variant.get("env_id", None)
    env_params = ENV_PARAMS.get(env_id, {})
    variant.update(env_params)
    env_class = variant.get("env_class", None)
    env_kwargs = variant.get("env_kwargs", {})

    expl_env = make(env_id, env_class, env_kwargs, normalize_env)
    eval_env = make(env_id, env_class, env_kwargs, normalize_env)

    if variant.get("add_env_demos", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_demo_path"])
    if variant.get("add_env_offpolicy_data", False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_offpolicy_data_path"])

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, "info_sizes"):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    qf_kwargs = variant.get("qf_kwargs", {})
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **qf_kwargs)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **qf_kwargs)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **qf_kwargs)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **qf_kwargs)

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant["policy_kwargs"]
    policy_path = variant.get("policy_path", False)
    if policy_path:
        policy = load_local_or_remote_file(policy_path)
    else:
        policy = policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **policy_kwargs,
        )

    buffer_policy_path = variant.get("buffer_policy_path", False)
    if buffer_policy_path:
        buffer_policy = load_local_or_remote_file(buffer_policy_path)
    else:
        buffer_policy_class = variant.get("buffer_policy_class", policy_class)
        buffer_policy = buffer_policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant.get("buffer_policy_kwargs", policy_kwargs),
        )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get("exploration_kwargs", {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == "ou":
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == "gauss_eps":
            es = GaussianAndEpsilonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs["noise"],
                min_sigma=exploration_kwargs["noise"],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise NotImplementedError(
                "unknown exploration strategy: {}".format(exploration_strategy))

    main_replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )

    replay_buffer = variant.get("replay_buffer_class", EnvReplayBuffer)(
        **main_replay_buffer_kwargs,
    )
    if variant.get("use_validation_buffer", False):
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get(
            "replay_buffer_class", EnvReplayBuffer)(
            **main_replay_buffer_kwargs,
        )
        replay_buffer = SplitReplayBuffer(train_replay_buffer,
                                          validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", AWACTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant["trainer_kwargs"],
    )

    if variant["collection_mode"] == "online":
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant["image_env_kwargs"]["presampled_goals"] = \
                load_local_or_remote_file(variant["presampled_goals"]).item()

        def get_img_env(env):
            renderer = EnvRenderer(**variant["renderer_kwargs"])
            img_env = InsertImageEnv(GymToMultiEnv(env), renderer=renderer)
            return img_env

        image_eval_env = ImageEnv(GymToMultiEnv(eval_env),
                                  **variant["image_env_kwargs"])
        # image_eval_env = get_img_env(eval_env)
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env),
                                  **variant["image_env_kwargs"])
        # image_expl_env = get_img_env(expl_env)
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    if variant.get("save_paths", False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get("load_demos", False):
        path_loader_class = variant.get("path_loader_class", MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos()
    if variant.get("load_env_dataset_demos", False):
        path_loader_class = variant.get("path_loader_class", HDF5PathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos(expl_env.get_dataset())

    if variant.get("save_initial_buffers", False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), "buffers.p")
        pickle.dump(buffers, open(buffer_path, "wb"))

    if variant.get("pretrain_buffer_policy", False):
        trainer.pretrain_policy_with_bc(
            buffer_policy,
            replay_buffer.train_replay_buffer,
            replay_buffer.validation_replay_buffer,
            10000,
            label="buffer",
        )
    if variant.get("pretrain_policy", False):
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if variant.get("pretrain_rl", False):
        trainer.pretrain_q_with_bc_data()

    if variant.get("save_pretrained_algorithm", False):
        p_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.p")
        pt_path = osp.join(logger.get_snapshot_dir(), "pretrain_algorithm.pt")
        data = algorithm._get_snapshot()
        data["algorithm"] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    if variant.get("train_rl", True):
        algorithm.train()
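
# ---------------------------------------------------------------------------
# Illustrative sketch only: how the demo-loading keys consumed by the AWAC
# launcher above fit together. The file paths are placeholders and the exact
# schema of each demo entry depends on the configured path_loader_class;
# treat this as an assumption, not the canonical format.
# ---------------------------------------------------------------------------
example_awac_demo_keys = dict(
    load_demos=True,
    path_loader_kwargs=dict(demo_paths=[], stack_obs=1),
    add_env_demos=True,
    env_demo_path=dict(path="demos/expert.npy", obs_dict=False, is_demo=True),
    add_env_offpolicy_data=True,
    env_offpolicy_data_path=dict(path="demos/offpolicy.npy", obs_dict=False,
                                 is_demo=False),
    pretrain_policy=True,   # behavior-clone `policy` on demo_train_buffer first
    pretrain_rl=True,       # then pre-train the Q-functions on the same data
)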

def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    if 'env' in variant:
        env_params = ENV_PARAMS[variant['env']]
        variant.update(env_params)

        if 'env_id' in env_params:
            if env_params['env_id'] in ['pen-v0', 'pen-sparse-v0', 'door-v0',
                                        'relocate-v0', 'hammer-v0',
                                        'pen-sparse-v0', 'door-sparse-v0',
                                        'relocate-sparse-v0',
                                        'hammer-sparse-v0']:
                import mj_envs
            expl_env = gym.make(env_params['env_id'])
            eval_env = gym.make(env_params['env_id'])
        else:
            expl_env = NormalizedBoxEnv(variant['env_class']())
            eval_env = NormalizedBoxEnv(variant['env_class']())

        if variant.get('sparse_reward', False):
            expl_env = RewardWrapperEnv(expl_env, compute_hand_sparse_reward)
            eval_env = RewardWrapperEnv(eval_env, compute_hand_sparse_reward)

        if variant.get('add_env_demos', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_demo_path"])
        if variant.get('add_env_offpolicy_data', False):
            variant["path_loader_kwargs"]["demo_paths"].append(
                variant["env_offpolicy_data_path"])
    else:
        expl_env = encoder_wrapped_env(variant)
        eval_env = encoder_wrapped_env(variant)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    M = variant['layer_size']
    vf_kwargs = variant.get("vf_kwargs", {})
    vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )
    target_vf1 = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        **vf_kwargs
    )

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant['policy_kwargs']
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )
    target_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )
    buffer_policy_class = variant.get("buffer_policy_class", policy_class)
    buffer_policy = buffer_policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant.get("buffer_policy_kwargs", policy_kwargs),
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpislonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise NotImplementedError(
                "unknown exploration strategy: {}".format(exploration_strategy))

    if variant.get('replay_buffer_class', EnvReplayBuffer) == AWREnvReplayBuffer:
        main_replay_buffer_kwargs = variant['replay_buffer_kwargs']
        main_replay_buffer_kwargs['env'] = expl_env
        # NOTE: this launcher builds vf1/target_vf1 rather than qf1/qf2, so
        # this branch only works if Q-functions are provided elsewhere.
        main_replay_buffer_kwargs['qf1'] = qf1
        main_replay_buffer_kwargs['qf2'] = qf2
        main_replay_buffer_kwargs['policy'] = policy
    else:
        main_replay_buffer_kwargs = dict(
            max_replay_buffer_size=variant['replay_buffer_size'],
            env=expl_env,
        )
    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    replay_buffer = variant.get('replay_buffer_class', EnvReplayBuffer)(
        **main_replay_buffer_kwargs,
    )
    if variant.get('use_validation_buffer', False):
        train_replay_buffer = replay_buffer
        validation_replay_buffer = variant.get('replay_buffer_class',
                                               EnvReplayBuffer)(
            **main_replay_buffer_kwargs,
        )
        replay_buffer = SplitReplayBuffer(train_replay_buffer,
                                          validation_replay_buffer, 0.9)

    trainer_class = variant.get("trainer_class", QuinoaTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        vf1=vf1,
        target_policy=target_policy,
        target_vf1=target_vf1,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )

    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    demo_test_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )

    if variant.get("save_video", False):
        if variant.get("presampled_goals", None):
            variant['image_env_kwargs']['presampled_goals'] = \
                load_local_or_remote_file(variant['presampled_goals']).item()
        image_eval_env = ImageEnv(GymToMultiEnv(eval_env),
                                  **variant["image_env_kwargs"])
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env),
                                  **variant["image_env_kwargs"])
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()

    if variant.get('save_initial_buffers', False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), 'buffers.p')
        pickle.dump(buffers, open(buffer_path, "wb"))

    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()

    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    if variant.get('train_rl', True):
        algorithm.train()

def experiment(variant):
    import multiworld.envs.pygame
    env = gym.make('Point2DEnv-Image-v0')
    input_width, input_height = env.image_shape

    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=qf_cnn,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=target_qf_cnn,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf1_cnn,
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=CNN(**cnn_params),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
    action_dim = int(np.prod(env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(
        policy_cnn,
        policy_cnn.conv_output_flat_size,
        action_dim,
    )

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    render = variant.get("render", False)
    debug = variant.get("debug", False)

    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    env_class = variant["env_class"]
    env_kwargs = variant["env_kwargs"]
    expl_env = env_class(**env_kwargs)
    eval_env = env_class(**env_kwargs)
    env = eval_env

    if variant.get('sparse_reward', False):
        expl_env = RewardWrapperEnv(expl_env, compute_hand_sparse_reward)
        eval_env = RewardWrapperEnv(eval_env, compute_hand_sparse_reward)

    if variant.get('add_env_demos', False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_demo_path"])
    if variant.get('add_env_offpolicy_data', False):
        variant["path_loader_kwargs"]["demo_paths"].append(
            variant["env_offpolicy_data_path"])

    if variant.get("use_masks", False):
        mask_wrapper_kwargs = variant.get("mask_wrapper_kwargs", dict())

        expl_mask_distribution_kwargs = variant["expl_mask_distribution_kwargs"]
        expl_mask_distribution = DiscreteDistribution(
            **expl_mask_distribution_kwargs)
        expl_env = RewardMaskWrapper(env, expl_mask_distribution,
                                     **mask_wrapper_kwargs)

        eval_mask_distribution_kwargs = variant["eval_mask_distribution_kwargs"]
        eval_mask_distribution = DiscreteDistribution(
            **eval_mask_distribution_kwargs)
        eval_env = RewardMaskWrapper(env, eval_mask_distribution,
                                     **mask_wrapper_kwargs)
        env = eval_env

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = variant.get('achieved_goal_key', 'latent_achieved_goal')
    obs_dim = (
        env.observation_space.spaces[observation_key].low.size
        + env.observation_space.spaces[desired_goal_key].low.size
    )
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    replay_buffer_kwargs = dict(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
    )
    replay_buffer_kwargs.update(variant.get('replay_buffer_kwargs', dict()))
    replay_buffer = ConcatToObsWrapper(
        ObsDictRelabelingBuffer(**replay_buffer_kwargs),
        ["resampled_goals", ],
    )
    replay_buffer_kwargs.update(variant.get('demo_replay_buffer_kwargs', dict()))
    demo_train_buffer = ConcatToObsWrapper(
        ObsDictRelabelingBuffer(**replay_buffer_kwargs),
        ["resampled_goals", ],
    )
    demo_test_buffer = ConcatToObsWrapper(
        ObsDictRelabelingBuffer(**replay_buffer_kwargs),
        ["resampled_goals", ],
    )

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant['policy_kwargs']
    policy_path = variant.get("policy_path", False)
    if policy_path:
        policy = load_local_or_remote_file(policy_path)
    else:
        policy = policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **policy_kwargs,
        )
    buffer_policy_path = variant.get("buffer_policy_path", False)
    if buffer_policy_path:
        buffer_policy = load_local_or_remote_file(buffer_policy_path)
    else:
        buffer_policy_class = variant.get("buffer_policy_class", policy_class)
        buffer_policy = buffer_policy_class(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant.get("buffer_policy_kwargs", policy_kwargs),
        )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpislonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise NotImplementedError(
                "unknown exploration strategy: {}".format(exploration_strategy))

    trainer = AWACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        # NOTE: eval_path_collector is only built in the 'else' branch below,
        # so this 'online' branch assumes one is provided elsewhere.
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    else:
        eval_path_collector = GoalConditionedPathCollector(
            eval_env,
            MakeDeterministic(policy),
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            render=render,
        )
        expl_path_collector = GoalConditionedPathCollector(
            expl_env,
            expl_policy,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            render=render,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant['min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)

    if variant.get("save_video", False):
        renderer_kwargs = variant.get("renderer_kwargs", {})
        save_video_kwargs = variant.get("save_video_kwargs", {})

        def get_video_func(
            env,
            policy,
            tag,
        ):
            renderer = EnvRenderer(**renderer_kwargs)
            state_goal_distribution = GoalDictDistributionFromMultitaskEnv(
                env,
                desired_goal_keys=[desired_goal_key],
            )
            image_goal_distribution = AddImageDistribution(
                env=env,
                base_distribution=state_goal_distribution,
                image_goal_key='image_desired_goal',
                renderer=renderer,
            )
            img_env = InsertImageEnv(env, renderer=renderer)
            rollout_function = partial(
                rf.multitask_rollout,
                max_path_length=variant['max_path_length'],
                observation_key=observation_key,
                desired_goal_key=desired_goal_key,
                return_dict_obs=True,
            )
            reward_fn = ContextualRewardFnFromMultitaskEnv(
                env=env,
                achieved_goal_from_observation=IndexIntoAchievedGoal(observation_key),
                desired_goal_key=desired_goal_key,
                achieved_goal_key="state_achieved_goal",
            )
            contextual_env = ContextualEnv(
                img_env,
                context_distribution=image_goal_distribution,
                reward_fn=reward_fn,
                observation_key=observation_key,
            )
            video_func = get_save_video_function(
                rollout_function,
                contextual_env,
                policy,
                tag=tag,
                imsize=renderer.width,
                image_format='CWH',
                **save_video_kwargs
            )
            return video_func

        expl_video_func = get_video_func(expl_env, expl_policy, "expl")
        eval_video_func = get_video_func(eval_env, MakeDeterministic(policy),
                                         "eval")
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(expl_video_func)

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    algorithm.train()

def experiment(variant):
    from multiworld.envs.mujoco import register_goal_example_envs
    register_goal_example_envs()

    eval_env = gym.make('Image48SawyerPushForwardEnv-v0')
    expl_env = gym.make('Image48SawyerPushForwardEnv-v0')

    # Hack for now
    eval_env.wrapped_env.transpose = True
    expl_env.wrapped_env.transpose = True

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant, args):
    # expl_env = NormalizedBoxEnv(gym.make(str(args.env)))
    # eval_env = NormalizedBoxEnv(gym.make(str(args.env)))
    expl_env = NormalizedBoxEnv(Mani2dEnv())
    eval_env = NormalizedBoxEnv(Mani2dEnv())

    setup_logger('DIAYNMUSIC_' + str(args.skill_dim) + '_' + args.env,
                 variant=variant, snapshot_mode="last")
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    set_seed(args.seed)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    skill_dim = args.skill_dim

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    df = FlattenMlp(
        input_size=obs_dim,
        output_size=skill_dim,
        hidden_sizes=[M, M],
    )

    # smile estimator
    mi_estimator = ConcatCritic(obs_dim, M, 2, "relu")
    smile_clip = 1.0

    policy = SkillTanhGaussianPolicy(
        obs_dim=obs_dim + skill_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        skill_dim=skill_dim)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = DIAYNMdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_step_collector = MdpStepCollector(
        expl_env,
        policy,
    )
    replay_buffer = DIAYNEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        skill_dim,
    )
    trainer = DIAYNMUSICTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        df=df,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        mi_estimator=mi_estimator,
        smile_clip=smile_clip,
        prio_extrio_bound=6,
        **variant['trainer_kwargs'])
    algorithm = DIAYNTorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
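
# ---------------------------------------------------------------------------
# Illustrative sketch only: one way the DIAYN-MUSIC launcher above might be
# invoked. The variant keys mirror what the function reads; the argparse
# fields and every hyperparameter value are placeholders, not settings from
# the original runs.
# ---------------------------------------------------------------------------
from argparse import Namespace  # stand-in for the script's real CLI parsing

example_music_args = Namespace(env='Mani2d-v0', skill_dim=10, seed=0)
example_music_variant = dict(
    layer_size=300,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=1000,
        batch_size=128,
        max_path_length=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_music_variant, example_music_args)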

def experiment(variant):
    env_params = ENV_PARAMS[variant["env"]]
    variant.update(env_params)

    if "env_id" in env_params:
        expl_env = gym.make(env_params["env_id"])
        eval_env = gym.make(env_params["env_id"])
    else:
        expl_env = NormalizedBoxEnv(variant["env_class"]())
        eval_env = NormalizedBoxEnv(variant["env_class"]())

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
    eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, "info_sizes"):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant["replay_buffer_size"],
        env=expl_env,
    )

    M = variant["layer_size"]
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant["policy_kwargs"],
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    trainer = AWACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant["trainer_kwargs"],
    )
    if variant["collection_mode"] == "online":
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    else:
        if variant.get("deterministic_exploration", False):
            expl_policy = eval_policy
        else:
            expl_policy = policy
        expl_path_collector = MdpPathCollector(
            expl_env,
            expl_policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant["max_path_length"],
            batch_size=variant["batch_size"],
            num_epochs=variant["num_epochs"],
            num_eval_steps_per_epoch=variant["num_eval_steps_per_epoch"],
            num_expl_steps_per_train_loop=variant[
                "num_expl_steps_per_train_loop"],
            num_trains_per_train_loop=variant["num_trains_per_train_loop"],
            min_num_steps_before_training=variant[
                "min_num_steps_before_training"],
        )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    if variant.get("save_paths", False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get("load_demos", False):
        path_loader_class = variant.get("path_loader_class", MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs,
        )
        path_loader.load_demos()
    if variant.get("pretrain_policy", False):
        trainer.pretrain_policy_with_bc()
    if variant.get("pretrain_rl", False):
        trainer.pretrain_q_with_bc_data()

    algorithm.train()

def experiment(variant):
    import gym
    from multiworld.envs.mujoco import register_custom_envs
    from multiworld.core.flat_goal_env import FlatGoalEnv
    register_custom_envs()

    expl_env = FlatGoalEnv(
        gym.make(variant['env_id']),
        obs_keys=['state_observation'],
        goal_keys=['xy_desired_goal'],
        append_goal_to_obs=False,
    )
    eval_env = FlatGoalEnv(
        gym.make(variant['env_id']),
        obs_keys=['state_observation'],
        goal_keys=['xy_desired_goal'],
        append_goal_to_obs=False,
    )
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    else:
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            num_epochs=variant['num_epochs'],
            num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
            num_expl_steps_per_train_loop=variant[
                'num_expl_steps_per_train_loop'],
            num_trains_per_train_loop=variant['num_trains_per_train_loop'],
            min_num_steps_before_training=variant[
                'min_num_steps_before_training'],
        )
    algorithm.to(ptu.device)
    algorithm.train()

def get_algorithm(expl_env, eval_env, skill_dim, epochs, length, file=None):
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    skill_dim = skill_dim
    # NOTE: `variant` is not a parameter here; it is assumed to be defined at
    # module scope with 'layer_size', 'replay_buffer_size', 'trainer_kwargs',
    # and 'algorithm_kwargs'.
    M = variant['layer_size']

    if file:
        print("old policy")
        data = torch.load(file)
        policy = data['evaluation/policy']
        qf1 = data['trainer/qf1']
        qf2 = data['trainer/qf2']
        target_qf1 = data['trainer/target_qf1']
        target_qf2 = data['trainer/target_qf2']
        df = data['trainer/df']
        policy = data['trainer/policy']
        eval_policy = MakeDeterministic(policy)
    else:
        print("new policy")
        qf1 = FlattenMlp(
            input_size=obs_dim + action_dim + skill_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )
        qf2 = FlattenMlp(
            input_size=obs_dim + action_dim + skill_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )
        target_qf1 = FlattenMlp(
            input_size=obs_dim + action_dim + skill_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )
        target_qf2 = FlattenMlp(
            input_size=obs_dim + action_dim + skill_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )
        df = FlattenMlp(
            input_size=obs_dim,
            output_size=skill_dim,
            hidden_sizes=[M, M],
        )
        policy = SkillTanhGaussianPolicy(
            obs_dim=obs_dim + skill_dim,
            action_dim=action_dim,
            hidden_sizes=[M, M],
            skill_dim=skill_dim)
        eval_policy = MakeDeterministic(policy)

    eval_path_collector = DIAYNMdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_step_collector = MdpStepCollector(
        expl_env,
        policy,
    )
    replay_buffer = DIAYNEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        skill_dim,
    )
    trainer = DIAYNTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        df=df,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])

    variant['algorithm_kwargs']['num_epochs'] = epochs
    variant['algorithm_kwargs']['max_path_length'] = length
    algorithm = DIAYNTorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])

    log_dir = setup_logger('DIAYN_' + str(skill_dim) + '_'
                           + expl_env.wrapped_env.spec.id, variant=variant)
    algorithm.log_dir = log_dir
    return algorithm
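
# ---------------------------------------------------------------------------
# Illustrative sketch only: driving get_algorithm end to end. The env id and
# skill settings are placeholders, and `variant` is assumed to already hold
# 'layer_size', 'replay_buffer_size', 'trainer_kwargs', and
# 'algorithm_kwargs' at module scope, as get_algorithm expects.
# ---------------------------------------------------------------------------
def example_run(env_id='HalfCheetah-v2', skill_dim=10, epochs=100, length=1000):
    expl_env = NormalizedBoxEnv(gym.make(env_id))
    eval_env = NormalizedBoxEnv(gym.make(env_id))
    algorithm = get_algorithm(expl_env, eval_env, skill_dim, epochs, length)
    algorithm.to(ptu.device)
    algorithm.train()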