def get_roboverse_env_from_params(environment_params):
    import roboverse
    use_predictive_model = environment_params['use_predictive_model']
    env_name = environment_params['env']
    randomize = environment_params['randomize_env']
    observation_mode = environment_params['obs']
    reward_type = environment_params['reward_type']
    single_obj_reward = environment_params['single_obj_reward']
    all_random = environment_params['all_random']
    trimodal_positions_choice = environment_params['trimodal_positions_choice']
    num_objects = environment_params['num_objects']
    base_env = roboverse.make(
        env_name,
        gui=False,
        randomize=randomize,
        observation_mode=observation_mode,
        reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=False,
        num_objects=num_objects,
        all_random=all_random,
        trimodal_positions_choice=trimodal_positions_choice)
    if use_predictive_model:
        model_dir = environment_params['model_dir']
        num_execution_per_step = environment_params['num_execution_per_step']
        img_width = base_env.obs_img_dim
        env = PredictiveModelEnvWrapper(model_dir, num_execution_per_step,
                                        base_env=base_env, img_dim=img_width)
        return GymAdapter(domain=None, task=None, env=env)
    else:
        return GymAdapter(domain=None, task=None, env=base_env)
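# For reference, a minimal sketch of the `environment_params` dictionary this
# helper expects. Every key below is read by get_roboverse_env_from_params;
# the specific values (env name, object count, etc.) are illustrative
# assumptions, not settings taken from the original experiments.
example_environment_params = {
    'use_predictive_model': False,     # skip the PredictiveModelEnvWrapper branch
    'env': 'SawyerGraspOneV2-v0',      # any registered roboverse env name
    'randomize_env': True,
    'obs': 'pixels_debug',
    'reward_type': 'shaped',
    'single_obj_reward': 0,
    'all_random': False,
    'trimodal_positions_choice': None,
    'num_objects': 1,
    # Only needed when 'use_predictive_model' is True:
    # 'model_dir': '/path/to/predictive/model',
    # 'num_execution_per_step': 2,
}

env = get_roboverse_env_from_params(example_environment_params)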
def create_env(randomize, reward_type, env_index, single_obj_reward):
    model_dir_local = "/home/huihanl/precog_nick/logs/esp_train_results/2020-07/" \
                      "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                      ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_server = "/nfs/kun1/users/huihanl/" \
                       "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                       ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_aws = "/home/ubuntu/07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection.basic_image_rnn.BasicImageRNNBijection"
    num_execution_per_step = 2

    base_env = roboverse.make(
        "SawyerGraspOneV2-v0",
        gui=False,
        randomize=randomize,
        observation_mode="pixels_debug",
        reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=True,
    )
    img_width, img_height = base_env.obs_img_dim, base_env.obs_img_dim
    env = PredictiveModelEnvWrapper(model_dir_aws, num_execution_per_step,
                                    base_env=base_env, img_dim=img_width)
    return env
def load_widowx():
    '''
    Loads sim datasets (uses rlkit from cql-private)
    '''
    from rlkit.data_management.load_buffer import load_data_from_npy
    import roboverse

    data_folder_path = '/home/jonathanyang0127/minibullet/data'
    data_file_path = data_folder_path + \
        ('/may25_Widow250OneObjectGraspShed-v0_20K_save_all_noise_0.1_2021-05-25T19-27-54/'
         'may25_Widow250OneObjectGraspShed-v0_20K_save_all_noise_0.1_2021-05-25T19-27-54_20000.npy')

    variant = {
        'buffer': data_file_path,
        'env': 'Widow250OneObjectGraspTrain-v0'
    }
    expl_env = roboverse.make(variant['env'], transpose_image=True)
    replay_buffer = load_data_from_npy(variant, expl_env, ['image'])

    train = WidowXDataset(replay_buffer, train=True, normalize=True,
                          image_dims=(48, 48, 3))
    val = WidowXDataset(replay_buffer, train=False, normalize=True,
                        image_dims=(48, 48, 3))
    return train, val
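# Hypothetical usage sketch for load_widowx(). It assumes WidowXDataset
# implements the standard torch.utils.data.Dataset interface, which the
# snippet above does not confirm; the exact batch structure depends on its
# __getitem__.
from torch.utils.data import DataLoader

train_set, val_set = load_widowx()
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64, shuffle=False)

for batch in train_loader:
    # each batch should hold normalized 48x48x3 image observations
    break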
def experiment(variant):
    # Resume training from a saved rlkit checkpoint: the env is rebuilt from
    # scratch, while the trainer and policies are restored from the pickle.
    checkpoint_filepath = os.path.join(
        variant['checkpoint_dir'],
        'itr_{}.pkl'.format(variant['checkpoint_epoch']))
    checkpoint = torch.load(checkpoint_filepath)

    # the following does not work for Bullet envs yet
    # eval_env = checkpoint['evaluation/env']
    # expl_env = checkpoint['exploration/env']
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env

    policy = checkpoint['trainer/trainer'].policy
    eval_policy = checkpoint['evaluation/policy']

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    observation_key = 'image'
    online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][
        'max_path_length']

    if variant['online_data_only']:
        replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env,
                                            observation_key=observation_key)
    else:
        replay_buffer = load_data_from_npy_chaining(
            variant, expl_env, observation_key,
            extra_buffer_size=online_buffer_size)

    trainer_kwargs = variant['trainer_kwargs']
    assert trainer_kwargs['min_q_weight'] > 0.
    trainer = checkpoint['trainer/trainer']
    trainer.min_q_weight = trainer_kwargs['min_q_weight']

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=False,
        **variant['algorithm_kwargs']
    )

    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
def main(args):
    timestamp = get_timestamp()
    if osp.exists(NFS_PATH):
        data_save_path = osp.join(NFS_PATH, args.save_directory)
    else:
        data_save_path = osp.join(__file__, "../..", "data",
                                  args.save_directory)
    data_save_path = osp.abspath(data_save_path)
    if not osp.exists(data_save_path):
        os.makedirs(data_save_path)

    env = roboverse.make(args.env_name, gui=args.gui, transpose_image=False)
    data = []
    assert args.policy_name in policies.keys(), \
        f"The policy name must be one of: {policies.keys()}"
    assert args.accept_trajectory_key in env.get_info().keys(), \
        f"The accept trajectory key must be one of: {env.get_info().keys()}"
    policy_class = policies[args.policy_name]
    policy = policy_class(env)
    num_success = 0
    num_saved = 0
    num_attempts = 0
    accept_trajectory_key = args.accept_trajectory_key

    progress_bar = tqdm(total=args.num_trajectories)

    while num_saved < args.num_trajectories:
        num_attempts += 1
        traj, success, num_steps = collect_one_traj(
            env, policy, args.num_timesteps, args.noise,
            accept_trajectory_key)

        if success:
            if args.gui:
                print("num_timesteps: ", num_steps)
            data.append(traj)
            num_success += 1
            num_saved += 1
            progress_bar.update(1)
        elif args.save_all:
            data.append(traj)
            num_saved += 1
            progress_bar.update(1)

        if args.gui:
            print("success rate: {}".format(num_success / num_attempts))

    progress_bar.close()
    print("success rate: {}".format(num_success / num_attempts))
    path = osp.join(data_save_path, "scripted_{}_{}.npy".format(
        args.env_name, timestamp))
    print(path)
    np.save(path, data)
def create_env_action(randomize, reward_type, env_index, single_obj_reward,
                      trimodal_positions):
    base_env = roboverse.make(
        "SawyerGraspOneV2-v0",
        gui=False,
        randomize=randomize,
        observation_mode="pixels_debug",
        reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=True,
        all_random=True,
        trimodal_positions=trimodal_positions)
    return base_env
def keyboard_control(args):
    env = roboverse.make(args.env_name, gui=True)
    while True:
        take_action = False
        action = np.array([0, 0, 0, 0, 0, 0, 0], dtype='float32')
        keys = bullet.p.getKeyboardEvents()
        for qKey in keys:
            if qKey in KEY_TO_ACTION_MAPPING.keys():
                action += KEY_TO_ACTION_MAPPING[qKey]
                take_action = True
            elif qKey in ENV_COMMANDS.keys():
                ENV_COMMANDS[qKey](env)
                take_action = False
        if take_action:
            obs, rew, done, info = env.step(action)
            print(rew)
        time.sleep(0.1)
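# The KEY_TO_ACTION_MAPPING and ENV_COMMANDS tables referenced above are not
# shown in this excerpt. Below is a minimal hypothetical sketch of their
# expected shape, assuming letter-key codes from pybullet's
# getKeyboardEvents() and the 7-dimensional delta action passed to env.step;
# the real key bindings and axis conventions may differ.
import numpy as np

KEY_TO_ACTION_MAPPING = {
    ord('w'): np.array([0.1, 0., 0., 0., 0., 0., 0.], dtype='float32'),
    ord('s'): np.array([-0.1, 0., 0., 0., 0., 0., 0.], dtype='float32'),
    ord('a'): np.array([0., 0.1, 0., 0., 0., 0., 0.], dtype='float32'),
    ord('d'): np.array([0., -0.1, 0., 0., 0., 0., 0.], dtype='float32'),
}

ENV_COMMANDS = {
    ord('r'): lambda env: env.reset(),  # reset the scene
}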
def create_env(randomize, reward_type, env_index, single_obj_reward):
    model_dir_local = "/home/huihanl/precog_nick/logs/esp_train_results/2020-07/" \
                      "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                      ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_server = "/nfs/kun1/users/huihanl/" \
                       "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                       ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_aws = "/home/ubuntu/07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection.basic_image_rnn.BasicImageRNNBijection"
    num_execution_per_step = 2

    trimodal_positions6 = [
        (0.8187814400771692, 0.21049907010351596, -0.3415106684025205),
        (0.739567302423451, 0.14341819851789023, -0.341380192135101),
        (0.6694743281369817, 0.26676337932361205, -0.3440640126774397)
    ]
    trimodal_positions5 = [
        (0.6838513781345804, 0.13232607273283675, -0.34202212957838085),
        (0.765710933262148, 0.17467922332524338, -0.3351583030520283),
        (0.6789658820114347, 0.2548283056922718, -0.34103265910465325)
    ]
    trimodal_positions0 = [
        (0.7044727231056753, 0.2885882337328971, -0.3434554430497813),
        (0.6950925950841189, 0.10769150255843486, -0.3408203519762647),
        (0.8303787805809874, 0.12114265344994643, -0.34381720097869056)
    ]
    trimodal_positions1 = [
        (0.7543401984506464, 0.21230734165805112, -0.3400822998707757),
        (0.7250127303935135, 0.1172465013108163, -0.34050488361050935),
        (0.668601621056849, 0.20450849328681078, -0.34356676661067254)
    ]
    trimodal_positions2 = [
        (0.6623195604074853, 0.1420046836817064, -0.34036979015824237),
        (0.7490314063602679, 0.11150642565578632, -0.34010550517840776),
        (0.8313050761244212, 0.186704691256355, -0.3444288731959656)
    ]
    trimodal_positions3 = [
        (0.702806187582968, 0.282862951083425, -0.34329308543453),
        (0.8053413878164436, 0.15063075895870554, -0.34222136237585643),
        (0.6598757532001869, 0.10674964260605753, -0.34047814967568574)
    ]
    trimodal_positions4 = [
        (0.7935222824622936, 0.19097678049219627, -0.336295087280328),
        (0.6742503469035555, 0.1988108314637719, -0.3439745727367933),
        (0.6661682273867254, 0.09909463325237348, -0.3399316482536911)
    ]

    trimodal_positions_choice = [
        trimodal_positions0,
        trimodal_positions1,
        trimodal_positions2,
        trimodal_positions3,
        trimodal_positions4,
        trimodal_positions5,
        trimodal_positions6
    ]
    trimodal_positions = trimodal_positions_choice[env_index]

    base_env = roboverse.make("SawyerGraspOneV2-v0",
                              gui=False,
                              randomize=randomize,
                              observation_mode="pixels_debug",
                              reward_type=reward_type,
                              single_obj_reward=single_obj_reward,
                              normalize_and_flatten=True,
                              all_random=True,
                              trimodal_positions=trimodal_positions)
    img_width, img_height = base_env.obs_img_dim, base_env.obs_img_dim
    env = PredictiveModelEnvWrapper(model_dir_aws, num_execution_per_step,
                                    base_env=base_env, img_dim=img_width)
    return env
        info = self.get_info()
        return bool(info['button_pressed_percentage'] >
                    self.button_pressed_success_thresh)

    def get_reward(self, info):
        if self.reward_type == "button_press":
            return float(info['button_pressed_success'])
        elif self.reward_type == "grasp":
            return float(info['grasp_success_target'])
        else:
            raise NotImplementedError


if __name__ == "__main__":
    env = roboverse.make("Widow250RandPosButtonPressTwoObjGrasp-v0",
                         gui=True, transpose_image=False)
    import time
    env.reset()
    # import IPython; IPython.embed()

    for j in range(5):
        object_utils.pop_up_button(env.objects['button'])
        time.sleep(1)
        object_utils.push_down_button(env.objects['button'])
        time.sleep(1)
        object_utils.pop_up_button(env.objects['button'])
        for i in range(20):
            obs, rew, done, info = env.step(
                np.asarray([-0.05, 0., 0., 0., 0., 0.5, 0.]))
            print("reward", rew, "info", info)
    def get_info(self):
        info = super(Widow250DoubleDrawerEnv, self).get_info()
        info['drawer_top_x_pos'] = self.get_drawer_pos("drawer_top")[0]
        info['drawer_top_opened_percentage'] = \
            self.get_drawer_opened_percentage("drawer_top")
        info['drawer_top_closed_success'] = \
            info["drawer_top_opened_percentage"] < \
            self.drawer_closed_success_thresh
        return info

    def is_top_drawer_closed(self):
        info = self.get_info()
        return info['drawer_top_closed_success']


if __name__ == "__main__":
    env = roboverse.make('Widow250DoubleDrawerCloseOpenNeutral-v0',
                         gui=True, transpose_image=False)
    import time
    env.reset()
    # import IPython; IPython.embed()

    for j in range(5):
        for i in range(20):
            obs, rew, done, info = env.step(
                np.asarray([-0.05, 0., 0., 0., 0., 0.5, 0., 0.]))
            print("reward", rew, "info", info)
            time.sleep(0.1)
        env.reset()
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--num-trajectories", type=int, required=True)
parser.add_argument("-t", "--num-timesteps", type=int, required=True)
parser.add_argument("-e", "--env-name", type=str, required=True)
parser.add_argument("--task-name", type=str, required=True)
args = parser.parse_args()

timestamp = timestamp()  # note: rebinds the timestamp() helper's name to its string result
data_save_path = os.path.join(__file__, "../..", 'data', timestamp)
data_save_path = os.path.abspath(data_save_path)
if not os.path.exists(data_save_path):
    os.makedirs(data_save_path)

data = []
env = roboverse.make(args.env_name, gui=True,
                     control_mode='discrete_gripper')
env.reset()

for j in tqdm(range(args.num_trajectories)):
    success, images, traj = collect_one_trajectory(env, args.num_timesteps)
    while success != 'y' and success != 'Y':
        print("failed for trajectory {}, collect again".format(j))
        success, images, traj = collect_one_trajectory(
            env, args.num_timesteps)
    data.append(traj)

    if j % 50 == 0:
        path = os.path.join(
            data_save_path,
            "{}_{}_{}_{}.npy".format(args.env_name, args.task_name,
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode=variant['obs'],
                              reward_type='shaped', transpose_image=True)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (
        action_dim + qf_cnn.conv_output_flat_size
    )
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
        action_dim + target_qf_cnn.conv_output_flat_size
    )
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )
    buffer_policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )
    # policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )
    # buffer_policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )

    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    trainer = AWRSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )

    expl_policy = policy
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        **variant['expl_path_collector_kwargs']
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    algorithm.to(ptu.device)

    demo_train_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )
    demo_test_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs
                                        )
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()
def experiment(variant):
    checkpoint_filepath = os.path.join(
        variant['checkpoint_dir'],
        'itr_{}.pkl'.format(variant['checkpoint_epoch']))
    checkpoint = torch.load(checkpoint_filepath)

    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    policy = checkpoint['evaluation/policy']
    eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    observation_key = 'image'
    online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][
        'max_path_length']

    if variant['online_data_only']:
        replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env,
                                            observation_key=observation_key)
    else:
        replay_buffer = load_data_from_npy_chaining(
            variant, expl_env, observation_key,
            extra_buffer_size=online_buffer_size)

    trainer_kwargs = variant['trainer_kwargs']
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **trainer_kwargs
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=False,
        **variant['algorithm_kwargs']
    )

    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
    def __init__(self, env_name, checkpoint_filename, num_timesteps_per_traj,
                 accept_trajectory_key, video_save_dir, save_all,
                 save_images, add_robot_view, noise=0.1):
        self.env_name = env_name
        self.num_timesteps_per_traj = num_timesteps_per_traj
        self.accept_trajectory_key = accept_trajectory_key
        self.noise = noise

        chkpt_split = checkpoint_filename.split('/')
        logdir = chkpt_split[-2] + '-' + chkpt_split[-1].split('.')[0]
        self.video_save_dir = os.path.join(video_save_dir, logdir)
        self.save_all = save_all
        if save_images:
            self.save_function = self.save_images
        else:
            self.save_function = self.save_video
        self.image_size = 512
        self.add_robot_view = add_robot_view

        if not os.path.exists(self.video_save_dir):
            os.makedirs(self.video_save_dir)

        # cam for pick and place (front view)
        # self.camera_target_pos = [0.55, 0.1, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (front view)
        # self.camera_target_pos = [0.60, 0.05, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (canonical view)
        self.camera_target_pos = [0.55, 0., -0.30]
        self.camera_roll = 0.0
        self.camera_pitch = -30.0
        self.camera_yaw = 150.0
        self.camera_distance = 0.64

        self.view_matrix_args = dict(target_pos=self.camera_target_pos,
                                     distance=self.camera_distance,
                                     yaw=self.camera_yaw,
                                     pitch=self.camera_pitch,
                                     roll=self.camera_roll,
                                     up_axis_index=2)
        self.view_matrix = bullet.get_view_matrix(**self.view_matrix_args)
        self.projection_matrix = bullet.get_projection_matrix(
            self.image_size, self.image_size)
        # end camera settings

        self.env = roboverse.make(self.env_name, gui=False,
                                  transpose_image=True)
        checkpoint = torch.load(checkpoint_filename)
        self.policy = checkpoint['evaluation/policy'].cpu()
        self.checkpoint_name = 'test123'
        self.trajectories_collected = 0
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped')
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    obs_dim = eval_env.observation_space.shape[0]

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],  # Making it easier to visualize
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMdpPathCollector(
        expl_env,
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         behavior_policy=None,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def create_env_action(randomize, reward_type, env_index, single_obj_reward):
    trimodal_positions6 = [
        (0.8187814400771692, 0.21049907010351596, -0.3415106684025205),
        (0.739567302423451, 0.14341819851789023, -0.341380192135101),
        (0.6694743281369817, 0.26676337932361205, -0.3440640126774397)
    ]
    trimodal_positions5 = [
        (0.6838513781345804, 0.13232607273283675, -0.34202212957838085),
        (0.765710933262148, 0.17467922332524338, -0.3351583030520283),
        (0.6789658820114347, 0.2548283056922718, -0.34103265910465325)
    ]
    trimodal_positions0 = [
        (0.7044727231056753, 0.2885882337328971, -0.3434554430497813),
        (0.6950925950841189, 0.10769150255843486, -0.3408203519762647),
        (0.8303787805809874, 0.12114265344994643, -0.34381720097869056)
    ]
    trimodal_positions1 = [
        (0.7543401984506464, 0.21230734165805112, -0.3400822998707757),
        (0.7250127303935135, 0.1172465013108163, -0.34050488361050935),
        (0.668601621056849, 0.20450849328681078, -0.34356676661067254)
    ]
    trimodal_positions2 = [
        (0.6623195604074853, 0.1420046836817064, -0.34036979015824237),
        (0.7490314063602679, 0.11150642565578632, -0.34010550517840776),
        (0.8313050761244212, 0.186704691256355, -0.3444288731959656)
    ]
    trimodal_positions3 = [
        (0.702806187582968, 0.282862951083425, -0.34329308543453),
        (0.8053413878164436, 0.15063075895870554, -0.34222136237585643),
        (0.6598757532001869, 0.10674964260605753, -0.34047814967568574)
    ]
    trimodal_positions4 = [
        (0.7935222824622936, 0.19097678049219627, -0.336295087280328),
        (0.6742503469035555, 0.1988108314637719, -0.3439745727367933),
        (0.6661682273867254, 0.09909463325237348, -0.3399316482536911)
    ]

    trimodal_positions_choice = [
        trimodal_positions0,
        trimodal_positions1,
        trimodal_positions2,
        trimodal_positions3,
        trimodal_positions4,
        trimodal_positions5,
        trimodal_positions6
    ]
    trimodal_positions = trimodal_positions_choice[env_index]

    base_env = roboverse.make("SawyerGraspOneV2-v0",
                              gui=False,
                              randomize=randomize,
                              observation_mode="pixels_debug",
                              reward_type=reward_type,
                              single_obj_reward=single_obj_reward,
                              normalize_and_flatten=True,
                              all_random=True,
                              trimodal_positions=trimodal_positions)
    return base_env
    else:
        raise NotImplementedError
    string_end = args.checkpoint.find('-v0')
    env_string = args.checkpoint[string_start:string_end]

    if 'pixel' in args.checkpoint:
        obs_mode = 'pixels'
    elif 'state' in args.checkpoint:
        obs_mode = 'state'
    else:
        raise NotImplementedError

    print('using env: {}'.format(env_string))
    env = roboverse.make('{}-v0'.format(env_string), gui=True,
                         observation_mode=obs_mode, transpose_image=True)

    return_list = []
    success_list = []
    max_path_length = 30
    for _ in range(100):
        obs = env.reset()
        if obs_mode == 'pixels':
            obs = obs['image']
        ret = 0
        for _ in range(max_path_length):
            action, _ = policy.get_action(obs)
            # print(action)
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode=variant['obs'],
                              reward_type='shaped', transpose_image=True)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))
    # obs_dim = 11

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (action_dim +
                                      target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                       policy_cnn.conv_output_flat_size,
                                       action_dim,
                                       **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)

    observation_key = 'image'
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs'])
    expl_path_collector = CustomObsDictPathCollector(
        expl_env,
        observation_key=observation_key,
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         behavior_policy=None,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = 11

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs']
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
        **variant['expl_path_collector_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=False,  # should be false for SAC
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs']
    )

    dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False,
                              randomize=variant['randomize_env'],
                              observation_mode=variant['obs'],
                              reward_type='shaped', transpose_image=True)
    if variant['obs'] == 'pixels_debug':
        robot_state_dims = 11
    elif variant['obs'] == 'pixels':
        robot_state_dims = 4
    else:
        raise NotImplementedError
    expl_env = FlatEnv(expl_env,
                       use_robot_state=variant['use_robot_state'],
                       robot_state_dims=robot_state_dims)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    if variant['use_robot_state']:
        cnn_params.update(
            added_fc_input_size=robot_state_dims,
            output_conv_channels=False,
            hidden_sizes=[400, 400],
            output_size=200,
        )
    else:
        cnn_params.update(
            added_fc_input_size=0,
            output_conv_channels=True,
            output_size=None,
        )

    qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        qf_obs_processor = qf_cnn
    else:
        qf_obs_processor = nn.Sequential(
            qf_cnn,
            Flatten(),
        )
    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    if variant['use_robot_state']:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.output_size)
    else:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        target_qf_obs_processor = target_qf_cnn
    else:
        target_qf_obs_processor = nn.Sequential(
            target_qf_cnn,
            Flatten(),
        )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    if variant['use_robot_state']:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.output_size)
    else:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        policy_obs_processor = policy_cnn
    else:
        policy_obs_processor = nn.Sequential(
            policy_cnn,
            Flatten(),
        )
    if variant['use_robot_state']:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.output_size,
                                           action_dim,
                                           **variant['policy_kwargs'])
    else:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.conv_output_flat_size,
                                           action_dim,
                                           **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])

    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    obs_dim = 11

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    buffer_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    trainer = AWRSACTrainer(env=eval_env,
                            policy=policy,
                            qf1=qf1,
                            qf2=qf2,
                            target_qf1=target_qf1,
                            target_qf2=target_qf2,
                            buffer_policy=buffer_policy,
                            **variant['trainer_kwargs'])

    expl_policy = policy
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )
    replay_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )
    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs)
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs)
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )
    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    if variant['stoch_eval_policy']:
        eval_policy = policy
    else:
        eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    trainer = BCTrainer(
        env=eval_env,
        policy=policy,
        **variant['trainer_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )

    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
    def __init__(self, env_name, scripted_policy_name, num_timesteps_per_traj,
                 accept_trajectory_key, video_save_dir, save_all,
                 save_images, add_robot_view, noise=0.1):
        self.env_name = env_name
        self.num_timesteps_per_traj = num_timesteps_per_traj
        self.accept_trajectory_key = accept_trajectory_key
        self.noise = noise
        self.video_save_dir = video_save_dir
        self.save_all = save_all
        if save_images:
            self.save_function = self.save_images
        else:
            self.save_function = self.save_video
        self.image_size = 512
        self.add_robot_view = add_robot_view

        if not os.path.exists(self.video_save_dir):
            os.makedirs(self.video_save_dir)

        # camera settings (default)
        # self.camera_target_pos = [0.57, 0.2, -0.22]
        # self.camera_roll = 0.0
        # self.camera_pitch = -10
        # self.camera_yaw = 215
        # self.camera_distance = 0.4

        # drawer cam (front view)
        # self.camera_target_pos = [0.60, 0.05, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (canonical view)
        self.camera_target_pos = [0.55, 0., -0.30]
        self.camera_roll = 0.0
        self.camera_pitch = -30.0
        self.camera_yaw = 150.0
        self.camera_distance = 0.64

        self.view_matrix_args = dict(target_pos=self.camera_target_pos,
                                     distance=self.camera_distance,
                                     yaw=self.camera_yaw,
                                     pitch=self.camera_pitch,
                                     roll=self.camera_roll,
                                     up_axis_index=2)
        self.view_matrix = bullet.get_view_matrix(**self.view_matrix_args)
        self.projection_matrix = bullet.get_projection_matrix(
            self.image_size, self.image_size)
        # end camera settings

        self.env = roboverse.make(self.env_name, gui=False,
                                  transpose_image=False)
        assert scripted_policy_name in policies.keys()
        self.policy_name = scripted_policy_name
        policy_class = policies[scripted_policy_name]
        self.scripted_policy = policy_class(self.env)
        self.trajectories_collected = 0
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped')
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = obs_dim = 11
    M = 256

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        latent_dim=action_dim * 2,
    )

    eval_path_collector = CustomMdpPathCollector(
        eval_env,
        save_images=True,
        **variant['eval_path_collector_kwargs'])
    vae_eval_path_collector = CustomMdpPathCollector(
        eval_env,
        max_num_epoch_paths_saved=5,
        save_images=True,
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = BEARTrainer(env=eval_env,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          target_qf1=target_qf1,
                          target_qf2=target_qf2,
                          vae=vae_policy,
                          **variant['trainer_kwargs'])

    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
        **variant['expl_path_collector_kwargs'])

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        vae_evaluation_data_collector=vae_eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=True,
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = eval_env.observation_space.shape[0]

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])

    if variant['scripted_policy']:
        if 'V3-v0' in variant['env']:
            scripted_policy = GraspV3ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V4-v0' in variant['env']:
            scripted_policy = GraspV4ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V5-v0' in variant['env']:
            scripted_policy = GraspV5ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        else:
            raise NotImplementedError
    else:
        scripted_policy = None

    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability_init=0.5,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability=0.9,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])

    dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )
    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(variant, expl_env,
                                                observation_key)

    # Translate 0/1 rewards to +4/+10 rewards.
    if variant['use_positive_rew']:
        if set(np.unique(replay_buffer._rewards)).issubset({0, 1}):
            replay_buffer._rewards = replay_buffer._rewards * 6.0
            replay_buffer._rewards = replay_buffer._rewards + 4.0
        assert set(np.unique(replay_buffer._rewards)).issubset(
            set(6.0 * np.array([0, 1]) + 4.0))

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs'])

    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
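# Quick standalone sanity check of the reward relabeling used above (a sketch,
# not part of the training script): mapping r -> 6*r + 4 sends the sparse 0/1
# grasp rewards to 4/10, matching the assertion in experiment().
import numpy as np

rewards = np.array([0., 1., 1., 0.])
translated = rewards * 6.0 + 4.0
assert set(np.unique(translated)).issubset({4.0, 10.0})
print(translated)  # [ 4. 10. 10.  4.]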