def experiment(variant):
    expl_env = roboverse.make(
        variant['env'],
        gui=False,
        randomize=variant['randomize_env'],
        observation_mode=variant['obs'],
        reward_type='shaped',
        transpose_image=True,
    )
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs']
    )

    observation_key = 'image'
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    if variant['collection_mode'] == 'batch':
        expl_path_collector = ObsDictPathCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = ObsDictStepCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    # dump_buffer_func = BufferSaveFunction(variant)
    algorithm.post_train_funcs.append(video_func)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
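# Illustrative config sketch (not taken from the original experiments): the
# launcher above documents its configuration only implicitly, through the
# `variant` keys it reads. The dictionary below collects those keys in one
# place. Every value is a hypothetical placeholder, and the nested kwargs
# assume rlkit/railrl-style constructor signatures (CNN, SACTrainer,
# TorchBatchRLAlgorithm); check them against the actual classes in this
# repository before use.
example_variant = dict(
    env='SawyerGraspOne-v0',          # placeholder roboverse env id
    obs='pixels',                     # forwarded as observation_mode
    randomize_env=True,
    collection_mode='batch',          # 'batch' or 'online'
    replay_buffer_size=int(1e5),
    cnn_params=dict(                  # assumes an rlkit-style CNN constructor
        kernel_sizes=[3, 3, 3],
        n_channels=[16, 16, 16],
        strides=[1, 1, 1],
        paddings=[1, 1, 1],
        hidden_sizes=[1024, 512],
    ),
    qf_kwargs=dict(hidden_sizes=[256, 256]),
    policy_kwargs=dict(hidden_sizes=[256, 256]),
    trainer_kwargs=dict(              # assumes SACTrainer hyperparameter names
        discount=0.99,
        soft_target_tau=5e-3,
        policy_lr=3e-4,
        qf_lr=3e-4,
    ),
    algo_kwargs=dict(                 # names mirror TorchBatchRLAlgorithm kwargs
        batch_size=256,
        max_path_length=50,
        num_epochs=100,
        num_eval_steps_per_epoch=500,
        num_expl_steps_per_train_loop=500,
        num_trains_per_train_loop=500,
        min_num_steps_before_training=1000,
    ),
    eval_path_collector_kwargs=dict(),
    expl_path_collector_kwargs=dict(),
)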
def experiment(variant):
    expl_env = gym.make('carla-lane-dict-v0')
    eval_env = expl_env
    num_channels, img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs']
    )

    cnn_vae_params = variant['cnn_vae_params']
    cnn_vae_params['conv_args'].update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    vae_policy = ConvVAEPolicy(
        representation_size=cnn_vae_params['representation_size'],
        architecture=cnn_vae_params,
        action_dim=action_dim,
        input_channels=3,
        imsize=img_width,
    )

    observation_key = 'image'
    eval_path_collector = CustomObsDictPathCollector(
        eval_env,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )
    vae_eval_path_collector = CustomObsDictPathCollector(
        eval_env,
        # eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )

    # with open(variant['buffer'], 'rb') as f:
    #     replay_buffer = pickle.load(f)
    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    load_hdf5(expl_env, replay_buffer)

    trainer = BEARTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs']
    )
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        **variant['expl_path_collector_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        vae_evaluation_data_collector=vae_eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=True,
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs']
    )

    video_func = VideoSaveFunctionBullet(variant)
    # dump_buffer_func = BufferSaveFunction(variant)
    algorithm.post_train_funcs.append(video_func)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # expl_env = carla_env.CarlaObsDictEnv(args=variant['env_args'])
    import gym
    import d4rl.carla
    expl_env = gym.make('carla-lane-dict-v0')
    eval_env = expl_env
    # num_channels, img_width, img_height = eval_env._wrapped_env.image_shape
    num_channels, img_width, img_height = eval_env.image_shape
    # num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))
    # obs_dim = 11

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs']
    )
    eval_policy = MakeDeterministic(policy)

    observation_key = 'image'
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )
    expl_path_collector = CustomObsDictPathCollector(
        expl_env,
        observation_key=observation_key,
    )

    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    load_hdf5(expl_env, replay_buffer)
    # load_buffer(buffer_path=variant['buffer'], replay_buffer=replay_buffer)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        behavior_policy=None,
        **variant['trainer_kwargs']
    )

    # Note: the original set variant['algo_kwargs'] here but unpacked
    # variant['algorithm_kwargs'] below; the same key is now used for both.
    variant['algorithm_kwargs']['max_path_length'] = expl_env._max_episode_steps
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    import gym
    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy
    )
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.torch.grill.launcher import (
        get_state_experiment_video_save_function
    )
    from railrl.torch.her.her_td3 import HerTd3
    from railrl.torch.td3.td3 import TD3
    from railrl.torch.networks import FlattenMlp, TanhMlpPolicy
    from railrl.data_management.obs_dict_replay_buffer import (
        ObsDictReplayBuffer
    )
    from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from railrl.samplers.data_collector.path_collector import (
        ObsDictPathCollector
    )

    if 'env_id' in variant:
        eval_env = gym.make(variant['env_id'])
        expl_env = gym.make(variant['env_id'])
    else:
        eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs'])
        eval_env = variant['env_class'](**eval_env_kwargs)
        expl_env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    # desired_goal_key = variant['desired_goal_key']
    # variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    # variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    # achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
        # achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )

    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=eval_env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=eval_env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=eval_env.action_space,
            **variant['es_kwargs']
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    trainer = TD3(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        # render=True,
        # desired_goal_key=desired_goal_key,
    )
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        # render=True,
        # desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )

    # if variant.get("save_video", False):
    #     rollout_function = rf.create_rollout_function(
    #         rf.multitask_rollout,
    #         max_path_length=algorithm.max_path_length,
    #         observation_key=observation_key,
    #         desired_goal_key=algorithm.desired_goal_key,
    #     )
    #     video_func = get_state_experiment_video_save_function(
    #         rollout_function,
    #         env,
    #         policy,
    #         variant,
    #     )
    #     algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import multiworld
    multiworld.register_all_envs()

    env = gym.make('Image48SawyerReachXYEnv-v1')
    observation_key = 'image_proprio_observation'
    input_width, input_height = env.image_shape
    action_dim = int(np.prod(env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        added_fc_input_size=3,
        output_conv_channels=True,
        output_size=None,
    )

    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf_cnn = CNN(**cnn_params)
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(target_qf_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + target_qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs']
        )
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf1_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs']
        )

    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(
        nn.Sequential(policy_cnn, Flatten()),
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs']
    )

    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
        **variant['replay_buffer_kwargs']
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    if variant['collection_mode'] == 'batch':
        expl_path_collector = ObsDictPathCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = ObsDictStepCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )

    algorithm.to(ptu.device)
    algorithm.train()
def active_representation_learning_experiment(variant):
    import railrl.torch.pytorch_util as ptu
    from railrl.data_management.obs_dict_replay_buffer import ObsDictReplayBuffer
    from railrl.torch.networks import FlattenMlp
    from railrl.torch.sac.policies import TanhGaussianPolicy
    from railrl.torch.arl.active_representation_learning_algorithm import \
        ActiveRepresentationLearningAlgorithm
    from railrl.torch.arl.representation_wrappers import RepresentationWrappedEnv
    from multiworld.core.image_env import ImageEnv
    from railrl.samplers.data_collector import MdpPathCollector

    preprocess_rl_variant(variant)

    model_class = variant.get('model_class')
    model_kwargs = variant.get('model_kwargs')
    model = model_class(**model_kwargs)
    model.representation_size = 4
    model.imsize = 48
    variant["vae_path"] = model

    reward_params = variant.get("reward_params", dict())
    init_camera = variant.get("init_camera", None)

    env = variant["env_class"](**variant['env_kwargs'])
    image_env = ImageEnv(
        env,
        variant.get('imsize'),
        init_camera=init_camera,
        transpose=True,
        normalize=True,
    )
    env = RepresentationWrappedEnv(
        image_env,
        model,
    )

    uniform_dataset_fn = variant.get('generate_uniform_dataset_fn', None)
    if uniform_dataset_fn:
        uniform_dataset = uniform_dataset_fn(
            **variant['generate_uniform_dataset_kwargs']
        )
    else:
        uniform_dataset = None

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size

    hidden_sizes = variant.get('hidden_sizes', [400, 300])
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes,
    )

    vae = env.vae
    replay_buffer = ObsDictReplayBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )

    model_trainer_class = variant.get('model_trainer_class')
    model_trainer_kwargs = variant.get('model_trainer_kwargs')
    model_trainer = model_trainer_class(
        model,
        **model_trainer_kwargs
    )
    # vae_trainer = ConvVAETrainer(
    #     env.vae,
    #     **variant['online_vae_trainer_kwargs']
    # )
    assert 'vae_training_schedule' not in variant, "Just put it in algo_kwargs"

    max_path_length = variant['max_path_length']

    trainer = SACTrainer(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['twin_sac_trainer_kwargs']
    )
    # trainer = HERTrainer(trainer)

    eval_path_collector = MdpPathCollector(
        env,
        MakeDeterministic(policy),
        # max_path_length,
        # observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )
    expl_path_collector = MdpPathCollector(
        env,
        policy,
        # max_path_length,
        # observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )

    algorithm = ActiveRepresentationLearningAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        model=model,
        model_trainer=model_trainer,
        uniform_dataset=uniform_dataset,
        max_path_length=max_path_length,
        **variant['algo_kwargs']
    )

    algorithm.to(ptu.device)
    vae.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = roboverse.make(
        variant['env'],
        gui=False,
        randomize=True,
        observation_mode=variant['obs'],
        reward_type='shaped',
        transpose_image=True,
    )
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )
    buffer_policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )
    # policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )
    # buffer_policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )

    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    trainer = AWRSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )

    expl_policy = policy
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        **variant['expl_path_collector_kwargs']
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    algorithm.to(ptu.device)

    demo_train_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    demo_test_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()
def experiment(variant):
    env_params = dict(
        block_random=0.3,
        camera_random=0,
        simple_observations=False,
        continuous=True,
        remove_height_hack=True,
        render_mode="DIRECT",
        # render_mode="GUI",
        num_objects=5,
        max_num_training_models=900,
        target=False,
        test=False,
    )
    expl_env = FlatEnv(KukaGraspingProceduralEnv(**env_params))
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + qf_cnn.conv_output_flat_size
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(policy_cnn, Flatten())
    policy = TanhGaussianPolicyAdapter(
        policy_obs_processor,
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs']
    )

    observation_key = 'image'
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    if variant['collection_mode'] == 'batch':
        expl_path_collector = ObsDictPathCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    elif variant['collection_mode'] == 'online':
        expl_path_collector = ObsDictStepCollector(
            expl_env,
            policy,
            observation_key=observation_key,
            **variant['expl_path_collector_kwargs']
        )
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs']
        )
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)
    # dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
def state_td3bc_experiment(variant):
    if variant.get('env_id', None):
        import gym
        import multiworld
        multiworld.register_all_envs()
        eval_env = gym.make(variant['env_id'])
        eval_env = MujocoGymToMultiEnv(eval_env)
        # eval_env = EncoderWrappedEnv(eval_env)
        expl_env = gym.make(variant['env_id'])
        expl_env = MujocoGymToMultiEnv(expl_env)
        # expl_env = EncoderWrappedEnv(expl_env)
    else:
        eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs'])
        eval_env = variant['env_class'](**eval_env_kwargs)
        expl_env = variant['env_class'](**variant['env_kwargs'])

    observation_key = 'state_observation'
    desired_goal_key = 'state_desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")

    es_strat = variant.get('es', 'ou')
    if es_strat == 'ou':
        es = OUStrategy(
            action_space=expl_env.action_space,
            max_sigma=variant['exploration_noise'],
            min_sigma=variant['exploration_noise'],
        )
    elif es_strat == 'gauss_eps':
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            max_sigma=variant['exploration_noise'],
            min_sigma=variant['exploration_noise'],  # constant sigma
            epsilon=0,
        )
    else:
        raise ValueError("invalid exploration strategy provided")

    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = 0  # expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
        # achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    demo_train_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
        # achieved_goal_key=achieved_goal_key,
        max_size=variant['replay_buffer_kwargs']['max_size'],
    )
    demo_test_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
        # achieved_goal_key=achieved_goal_key,
        max_size=variant['replay_buffer_kwargs']['max_size'],
    )

    if variant.get('td3_bc', True):
        td3_trainer = TD3BCTrainer(
            env=expl_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            target_policy=target_policy,
            **variant['td3_bc_trainer_kwargs']
        )
    else:
        td3_trainer = TD3(
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            target_policy=target_policy,
            **variant['td3_trainer_kwargs']
        )
    trainer = td3_trainer  # HERTrainer(td3_trainer)

    eval_path_collector = ObsDictPathCollector(  # GoalConditionedPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )
    expl_path_collector = ObsDictPathCollector(  # GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )

    if variant.get("save_video", True):
        if variant.get("presampled_goals", None):
            variant['image_env_kwargs']['presampled_goals'] = \
                load_local_or_remote_file(variant['presampled_goals']).item()
        image_eval_env = ImageEnv(eval_env, **variant["image_env_kwargs"])
        image_eval_path_collector = ObsDictPathCollector(  # GoalConditionedPathCollector(
            image_eval_env,
            policy,
            observation_key='state_observation',
            # desired_goal_key='state_desired_goal',
        )
        image_expl_env = ImageEnv(expl_env, **variant["image_env_kwargs"])
        image_expl_path_collector = ObsDictPathCollector(  # GoalConditionedPathCollector(
            image_expl_env,
            expl_policy,
            observation_key='state_observation',
            # desired_goal_key='state_desired_goal',
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)

    if variant.get('load_demos', False):
        td3_trainer.load_demos()
    if variant.get('pretrain_policy', False):
        td3_trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        td3_trainer.pretrain_q_with_bc_data()

    algorithm.train()