def experiment(variant):
    eval_env = gym.make(
        variant['env_name'], **{
            "headless": variant["headless"],
            "verbose": variant["verbose"]
        })
    eval_env.seed(variant['seed'])
    expl_env = eval_env

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin Q-networks, their targets, and the policy
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    # TODO: remove this `with` block once the anomaly-detection issue is figured out.
    with torch.autograd.set_detect_anomaly(True):
        algorithm.train()
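# Illustrative only: a sketch of the `variant` dict this runner reads, reconstructed
# from the keys accessed above. All concrete values (environment id, sizes, paths)
# are placeholder assumptions, not the authors' settings; trainer_kwargs and
# algorithm_kwargs must match whatever CQLTrainer and TorchBatchRLAlgorithm accept
# in this codebase.
example_variant = dict(
    env_name='HalfCheetah-v2',            # assumed example environment id
    headless=True,
    verbose=False,
    seed=0,
    layer_size=256,
    buffer_filename=None,                 # set to a saved-buffer path to use load_buffer
    load_buffer=False,
    h5path='/path/to/demonstrations.hdf5',  # placeholder dataset path
    replay_buffer_size=int(2e6),
    trainer_kwargs=dict(),                # CQL hyperparameters go here
    algorithm_kwargs=dict(),              # epoch/step counts, batch size, etc.
)
# experiment(example_variant)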
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']

    # Q and policy networks
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)

    # Initialize the policy with behavior cloning (BC) or from scratch.
    # NOTE: `map_location` is assumed to be defined at module scope (e.g. set from ptu.device).
    if variant['bc_model'] is None:
        policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[M, M],
        ).to(ptu.device)
    else:
        bc_model = Mlp(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[64, 64],
            output_activation=F.tanh,
        ).to(ptu.device)
        checkpoint = torch.load(variant['bc_model'], map_location=map_location)
        bc_model.load_state_dict(checkpoint['network_state_dict'])
        print('Loading bc model: {}'.format(variant['bc_model']))

        # Policy whose mean network is initialized with the BC model
        policy = TanhGaussianPolicy_BC(
            obs_dim=obs_dim,
            action_dim=action_dim,
            mean_network=bc_model,
            hidden_sizes=[M, M],
        ).to(ptu.device)

    # If a bonus is used, load the pretrained bonus network
    if not variant['offline']:
        bonus_layer_size = variant['bonus_layer_size']
        bonus_network = Mlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=[bonus_layer_size, bonus_layer_size],
            output_activation=F.sigmoid,
        ).to(ptu.device)
        checkpoint = torch.load(variant['bonus_path'], map_location=map_location)
        bonus_network.load_state_dict(checkpoint['network_state_dict'])
        print('Loading bonus model: {}'.format(variant['bonus_path']))

        if variant['initialize_Q'] and bonus_layer_size == M:
            target_qf1.load_state_dict(checkpoint['network_state_dict'])
            target_qf2.load_state_dict(checkpoint['network_state_dict'])
            print('Initialize QF1 and QF2 with the bonus model: {}'.format(
                variant['bonus_path']))
        if variant['initialize_Q'] and bonus_layer_size != M:
            print('Size mismatch between Q and bonus - turning off the initialization')

    # eval_policy = MakeDeterministic(policy)
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    dataset = eval_env.unwrapped.get_dataset()
    load_hdf5(dataset, replay_buffer, max_size=variant['replay_buffer_size'])

    # Observation normalization statistics for the bonus network
    if variant['normalize']:
        obs_mu = dataset['observations'].mean(axis=0)
        obs_std = dataset['observations'].std(axis=0)
        bonus_norm_param = [obs_mu, obs_std]
    else:
        bonus_norm_param = [None] * 2

    # Shift the reward
    if variant['reward_shift'] is not None:
        rewards_shift_param = min(dataset['rewards']) - variant['reward_shift']
        print('.... reward is shifted: {}'.format(rewards_shift_param))
    else:
        rewards_shift_param = None

    if variant['offline']:
        trainer = SACTrainer(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            rewards_shift_param=rewards_shift_param,
            **variant['trainer_kwargs']
        )
        print('Agent of type offline SAC created')
    elif variant['bonus'] == 'bonus_add':
        trainer = SAC_BonusTrainer(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            bonus_network=bonus_network,
            beta=variant['bonus_beta'],
            use_bonus_critic=variant['use_bonus_critic'],
            use_bonus_policy=variant['use_bonus_policy'],
            use_log=variant['use_log'],
            bonus_norm_param=bonus_norm_param,
            rewards_shift_param=rewards_shift_param,
            device=ptu.device,
            **variant['trainer_kwargs']
        )
        print('Agent of type SAC + additive bonus created')
    elif variant['bonus'] == 'bonus_mlt':
        trainer = SAC_BonusTrainer_Mlt(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            bonus_network=bonus_network,
            beta=variant['bonus_beta'],
            use_bonus_critic=variant['use_bonus_critic'],
            use_bonus_policy=variant['use_bonus_policy'],
            bonus_norm_param=bonus_norm_param,
            rewards_shift_param=rewards_shift_param,
            device=ptu.device,
            **variant['trainer_kwargs']
        )
        print('Agent of type SAC + multiplicative bonus created')
    else:
        raise ValueError('Unknown bonus type: {}'.format(variant['bonus']))

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
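# Illustrative only: the `variant` keys consumed by the runner above, with placeholder
# values that are assumptions rather than the authors' settings. `bonus` selects the
# additive ('bonus_add') or multiplicative ('bonus_mlt') trainer, `offline=True`
# bypasses the bonus entirely, and `bc_model`/`bonus_path` point at checkpoints
# produced elsewhere in this codebase.
example_variant = dict(
    env_name='halfcheetah-medium-v0',  # assumed D4RL task id
    layer_size=256,
    bc_model=None,                     # or a path to a BC checkpoint
    offline=False,
    bonus='bonus_add',                 # 'bonus_add' | 'bonus_mlt'
    bonus_path='/path/to/bonus.pt',    # placeholder checkpoint path
    bonus_layer_size=256,
    bonus_beta=1.0,                    # assumed bonus scale
    use_bonus_critic=True,
    use_bonus_policy=True,
    use_log=False,
    initialize_Q=False,
    normalize=True,
    reward_shift=None,
    buffer_filename=None,
    replay_buffer_size=int(2e6),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(),
)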
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )
    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    # Translate 0/1 rewards to +4/+10 rewards.
    if variant['use_positive_rew']:
        if set(np.unique(replay_buffer._rewards)).issubset({0, 1}):
            replay_buffer._rewards = replay_buffer._rewards * 6.0
            replay_buffer._rewards = replay_buffer._rewards + 4.0
        assert set(np.unique(replay_buffer._rewards)).issubset(
            set(6.0 * np.array([0, 1]) + 4.0))

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
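# Illustrative only: a sketch of `variant['cnn_params']` for the image-based runner
# above. The runner overwrites the input size, output_size, and added_fc_input_size
# itself, so only the convolutional architecture needs to be supplied. The field names
# below (kernel_sizes, n_channels, strides, paddings, hidden_sizes) are assumptions
# about the ConcatCNN/CNN constructors used here; check their signatures before
# relying on them.
example_cnn_params = dict(
    kernel_sizes=[3, 3, 3],
    n_channels=[16, 16, 16],
    strides=[1, 1, 1],
    paddings=[1, 1, 1],
    hidden_sizes=[1024, 512, 256],
)
# variant = dict(env='Widow250PickPlace-v0',   # assumed roboverse task id
#                cnn_params=example_cnn_params,
#                use_positive_rew=True,
#                trainer_kwargs=dict(),
#                algorithm_kwargs=dict())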
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer,
              max_size=variant['replay_buffer_size'])

    trainer = BEARTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
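# Illustrative only: a minimal rlkit-style driver for the BEAR runner above. The
# variant values, experiment prefix, and GPU flag are placeholder assumptions, not
# settings from the original scripts; setup_logger and ptu.set_gpu_mode are the usual
# rlkit helpers and are assumed to be importable in this codebase.
if __name__ == "__main__":
    import rlkit.torch.pytorch_util as ptu
    from rlkit.launchers.launcher_util import setup_logger

    variant = dict(
        env_name='halfcheetah-medium-v0',  # assumed D4RL task id
        layer_size=256,
        buffer_filename=None,
        replay_buffer_size=int(2e6),
        trainer_kwargs=dict(),     # BEAR hyperparameters go here
        algorithm_kwargs=dict(),   # training-loop settings go here
    )
    setup_logger('bear-d4rl-example', variant=variant)
    ptu.set_gpu_mode(True)  # set to False to run on CPU
    experiment(variant)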