def main():
    render = True

    # Parameters
    horizon = 60
    total_repeats = 50
    state_dim = 12
    action_dim = 2
    env_name = 'SawyerSlide'

    ##### Prepare environment #####
    env = make(
        'SawyerSlide',
        gripper_type="SlidePanelGripper",
        parameters_to_randomise=slide_full_randomisation,
        randomise_initial_conditions=False,
        use_camera_obs=False,
        use_object_obs=True,
        reward_shaping=True,
        use_indicator_object=False,
        has_renderer=render,
        has_offscreen_renderer=False,
        render_collision_mesh=False,
        render_visual_mesh=True,
        control_freq=10,
        horizon=60,
        ignore_done=False,
        camera_name="frontview",
        camera_height=256,
        camera_width=256,
        camera_depth=False,
        pid=True,
    )
    env = FlattenWrapper(GymWrapper(env), keys='task-state', add_info=True)
    env._name = env_name
    env.reset()

    for policy_idx in range(1):
        ##### Setting up the policy #####
        method = ['Single', 'LSTM', 'EPI', 'UPOSI'][policy_idx]
        if method == 'Single':
            alg_idx = 1
        elif method == 'LSTM':
            alg_idx = 2
        elif method == 'UPOSI':
            alg_idx = 3
            osi_l = 5
            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = False  # the sliding env does not need the internal state and action
            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[0]  # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l * (state_dim + action_dim + internal_state_dim + internal_action_dim)
            else:
                osi_input_dim = osi_l * (state_dim + action_dim)
            state_dim += params_dim
        elif method == 'EPI':
            alg_idx = 4
            embed_dim = 10
            traj_l = 10
            NO_RESET = True
            embed_input_dim = traj_l * (state_dim + action_dim)
            ori_state_dim = state_dim
            state_dim += embed_dim
        else:
            continue

        alg_name = ['sac', 'td3', 'lstm_td3', 'uposi_td3', 'epi_td3'][alg_idx]
        if method == 'EPI' or method == 'UPOSI':
            randomisation_idx_range = 1
        else:
            randomisation_idx_range = 4

        for randomisation_idx in range(1, 2):  # randomisation_idx_range
            goal_idx = 1

            ##### Setting up the environment #####
            randomisation_params = [
                slide_full_randomisation,
                [],
                slide_force_randomisation,
                slide_force_noise_randomisation,
            ]
            env.change_parameters_to_randomise(randomisation_params[randomisation_idx])
            env.reset()
            if render:
                # env.viewer.set_camera(camera_id=0)
                env.render()

            # choose which randomisation is applied
            randomisation_type = ['slide_full-randomisation', 'slide_no-randomisation',
                                  'slide_force-randomisation', 'slide_force-&-noise-randomisation'][randomisation_idx]
            number_random_params = [22, 0, 2, 8][randomisation_idx]
            folder_path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/' + method + '/' + alg_name + '/model/'
            path = folder_path + env_name + str(number_random_params) + '_' + alg_name
            try:
                policy = load(path=path, alg=alg_name, state_dim=state_dim, action_dim=action_dim)
                if method == 'UPOSI':
                    osi_model = load_model(model_name='osi', path=path,
                                           input_dim=osi_input_dim, output_dim=params_dim)
                elif method == 'EPI':
                    embed_model = load_model(model_name='embedding', path=path,
                                             input_dim=embed_input_dim, output_dim=embed_dim)
                    embed_model.cuda()
                    epi_policy_path = folder_path + env_name + str(number_random_params) + '_' + 'epi_ppo_epi_policy'
                    epi_policy = load(path=epi_policy_path, alg='ppo',
                                      state_dim=ori_state_dim, action_dim=action_dim)
            except Exception:
                print(method, randomisation_type)
                continue

            if alg_name == 'lstm_td3':
                # initialise the LSTM hidden state: (hidden, cell), each of shape (layer, batch, dim)
                hidden_out = (torch.zeros([1, 1, 512], dtype=torch.float).cuda(),
                              torch.zeros([1, 1, 512], dtype=torch.float).cuda())
                last_action = np.array([0, 0])
            ######################################

            log_save_name = '{}_{}_{}_{}'.format(method, alg_name, randomisation_type, number_random_params)

            for repeat in range(total_repeats):
                # Reset environment
                obs = env.reset()
                if render:
                    # env.viewer.set_camera(camera_id=0)
                    env.render()

                # Set up logger
                log_path = '../../../../data/sliding/sim/{}/goal_{}/trajectory_log_{}.csv'.format(
                    log_save_name, goal_idx, repeat)
                log_list = ["step", "time",
                            "cmd_j5", "cmd_j6",
                            "obj_x", "obj_y", "obj_z",
                            "sin_z", "cos_z",
                            "obj_vx", "obj_vy", "obj_vz",
                            "a_j5", "a_j6",
                            "v_j5", "v_j6",
                            ]
                logger = Logger(log_list, log_path, verbatim=render)

                i = 0
                mujoco_start_time = env.sim.data.time

                if alg_name == 'uposi_td3':
                    uposi_traj = []
                    # params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                    zero_osi_input = np.zeros(osi_input_dim)
                    pre_params = osi_model(zero_osi_input).detach().numpy()
                    params_state = np.concatenate((pre_params, obs))
                elif alg_name == 'epi_td3':
                    if NO_RESET:
                        i = traj_l - 1
                        # only one trajectory; pass in params to ensure they are not changed
                        traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                         mujoco_start_time=mujoco_start_time,
                                                                         logger=logger,
                                                                         data_grabber=None,
                                                                         max_steps=traj_l,
                                                                         params=None)
                        state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                        embedding = embed_model(state_action_in_traj.reshape(-1))
                        embedding = embedding.detach().cpu().numpy()
                        obs = last_obs             # last observation
                        env.set_state(last_state)  # last underlying state
                    else:
                        # only one trajectory; pass in params to ensure they are not changed
                        traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                         mujoco_start_time=mujoco_start_time,
                                                                         logger=None,
                                                                         data_grabber=None,
                                                                         max_steps=traj_l,
                                                                         params=None)
                        state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                        embedding = embed_model(state_action_in_traj.reshape(-1))
                        embedding = embedding.detach().cpu().numpy()

                        params = env.get_dynamics_parameters()
                        env.randomisation_off()
                        env.set_dynamics_parameters(params)  # same as the rollout env
                        obs = env.reset()
                        env.randomisation_on()
                    obs = np.concatenate((obs, embedding))

                while True:
                    mujoco_elapsed = env.sim.data.time - mujoco_start_time

                    #### Choosing the action ####
                    if alg_name == 'lstm_td3':
                        hidden_in = hidden_out
                        action, hidden_out = policy.get_action(obs, last_action, hidden_in, noise_scale=0.0)
                        last_action = action
                    elif alg_name == 'uposi_td3':
                        # using internal state or not
                        if CAT_INTERNAL:
                            internal_state = env.get_internal_state()
                            full_state = np.concatenate([obs, internal_state])
                        else:
                            full_state = obs
                        action = policy.get_action(params_state, noise_scale=0.0)
                    else:
                        action = policy.get_action(obs, noise_scale=0.0)
                    #############################

                    next_obs, reward, done, info = env.step(action)

                    if alg_name == 'uposi_td3':
                        if CAT_INTERNAL:
                            target_joint_action = info["joint_velocities"]
                            full_action = np.concatenate([action, target_joint_action])
                        else:
                            full_action = action
                        uposi_traj.append(np.concatenate((full_state, full_action)))

                        if len(uposi_traj) >= osi_l:
                            osi_input = stack_data(uposi_traj, osi_l)
                            pre_params = osi_model(osi_input).detach().numpy()
                        else:
                            zero_osi_input = np.zeros(osi_input_dim)
                            pre_params = osi_model(zero_osi_input).detach().numpy()

                        next_params_state = np.concatenate((pre_params, next_obs))
                        params_state = next_params_state
                    elif alg_name == 'epi_td3':
                        next_obs = np.concatenate((next_obs, embedding))

                    # Grab logging data
                    # assert len(obs) == 12
                    logger.log(i, mujoco_elapsed,
                               action[0], action[1],
                               obs[0], obs[1], obs[2],
                               obs[3], obs[4],
                               obs[5], obs[6], obs[7],
                               obs[8], obs[9],
                               obs[10], obs[11],
                               )

                    obs = next_obs
                    if render:
                        env.render()
                    i += 1
                    if i >= horizon or done:
                        break

    env.close()
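# --- Illustrative sketch (not part of the original scripts) -------------------------
# The UPOSI branch above calls a helper `stack_data(traj, length)` to turn the most
# recent `osi_l` (state, action) vectors into one flat OSI input. The repository ships
# its own implementation; the stand-in below only mirrors how it is used here and
# assumes every entry of `traj` is a 1-D numpy array of equal length with
# len(traj) >= length.
import numpy as np


def stack_data_sketch(traj, length):
    """Concatenate the last `length` (state, action) vectors into one flat vector."""
    return np.concatenate(traj[-length:], axis=0)

# Example: with state_dim=12, action_dim=2 and osi_l=5 the stacked input has
# dimension 5 * (12 + 2) = 70, matching `osi_input_dim` computed above.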
def worker(id, td3_trainer, osi_model, osi_l, osi_input_dim, osi_batch_size, osi_itr, osi_pretrain_eps,
           env_name, environment_params, environment_wrappers, environment_wrapper_arguments,
           rewards_queue, eval_rewards_queue, success_queue, eval_success_queue, osi_loss_queue,
           eval_interval, replay_buffer, max_episodes, max_steps, batch_size, explore_steps,
           noise_decay, update_itr, explore_noise_scale, eval_noise_scale, reward_scale,
           DETERMINISTIC, hidden_dim, model_path, seed=1):
    '''
    The function for sampling with multi-processing.
    '''
    with torch.cuda.device(id % torch.cuda.device_count()):
        td3_trainer.to_cuda()
        print(td3_trainer, replay_buffer)

        env = make_env('robosuite.' + env_name, seed, id, environment_params,
                       environment_wrappers, environment_wrapper_arguments)()
        action_dim = env.action_space.shape[0]

        frame_idx = 0
        rewards = []
        history_data = []
        params_list = []
        current_explore_noise_scale = explore_noise_scale

        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            epi_traj = []
            state = env.reset()
            # attach the dynamics parameters to the state
            # (in reality the true parameters cannot be queried)
            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_state = np.concatenate((params, state))
            current_explore_noise_scale = current_explore_noise_scale * noise_decay

            for step in range(max_steps):
                # using internal state or not
                if CAT_INTERNAL:
                    internal_state = env.get_internal_state()
                    full_state = np.concatenate([state, internal_state])
                else:
                    full_state = state

                # exploration steps
                if frame_idx > explore_steps:
                    action = td3_trainer.policy_net.get_action(params_state,
                                                               noise_scale=current_explore_noise_scale)
                else:
                    action = td3_trainer.policy_net.sample_action()

                try:
                    next_state, reward, done, info = env.step(action)
                    if environment_params["has_renderer"] and environment_params["render_visual_mesh"]:
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    if osi_model is None:
                        td3_trainer.save_model(model_path)
                except MujocoException:
                    print('MujocoException')
                    break

                if info["unstable"]:  # capture the case with the cube flying away in the pushing task
                    break

                if CAT_INTERNAL:
                    target_joint_action = info["joint_velocities"]
                    full_action = np.concatenate([action, target_joint_action])
                else:
                    full_action = action
                # internal state is for OSI only, not for the universal policy
                epi_traj.append(np.concatenate((full_state, full_action)))

                if osi_model is not None and eps > osi_pretrain_eps:
                    if len(epi_traj) >= osi_l:
                        # stack (s, a) pairs to match the length of the model input
                        osi_input = stack_data(epi_traj, osi_l)
                        pre_params = osi_model(osi_input).detach().numpy()
                    else:
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()
                else:
                    pre_params = params

                next_params_state = np.concatenate((pre_params, next_state))
                replay_buffer.push(params_state, action, reward, next_params_state, done)

                state = next_state
                params_state = next_params_state
                episode_reward += reward
                frame_idx += 1

                if replay_buffer.get_length() > batch_size and osi_model is None:
                    for i in range(update_itr):
                        _ = td3_trainer.update(batch_size,
                                               eval_noise_scale=eval_noise_scale,
                                               reward_scale=reward_scale)

                if done:
                    break

            if osi_model is not None:  # train OSI
                history_data.append(np.array(epi_traj))
                params_list.append(params)
                if eps % osi_batch_size == 0 and eps > 0:
                    label, data = generate_data(params_list, history_data, length=osi_l)
                    osi_model, loss = OnlineSIupdate(osi_model, data, label, epoch=osi_itr)
                    osi_loss_queue.put(loss)
                    # clear the dataset; alternative: use an offline buffer
                    params_list = []
                    history_data = []
                    torch.save(osi_model.state_dict(), model_path + '_osi')
                    print('OSI Episode: {} | Epoch Loss: {}'.format(eps, loss))
            else:  # train the universal policy
                print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
                rewards_queue.put(episode_reward)
                success_queue.put(info['success'])
                if eps % eval_interval == 0 and eps > 0:
                    # plot(rewards, id)
                    td3_trainer.save_model(model_path)
                    eval_r, eval_succ = evaluate(env, td3_trainer.policy_net, up=True,
                                                 randomised_only=RANDOMISZED_ONLY,
                                                 dynamics_only=DYNAMICS_ONLY)
                    eval_rewards_queue.put(eval_r)
                    eval_success_queue.put(eval_succ)

        if osi_model is not None:  # train OSI
            torch.save(osi_model.state_dict(), model_path + '_osi')
        else:  # train the universal policy
            td3_trainer.save_model(model_path)
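# --- Illustrative sketch (not part of the original scripts) -------------------------
# `generate_data` and `OnlineSIupdate` above come from the repository. Conceptually,
# online system identification (OSI) is supervised regression from a short window of
# stacked (state, action) pairs to the dynamics parameters that generated them. The
# self-contained stand-in below shows that update; the class name, architecture and
# hyperparameters are assumptions, not the repository's implementation.
import numpy as np
import torch
import torch.nn as nn


class OSISketch(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.net(x)


def osi_update_sketch(model, data, label, epoch=5, lr=1e-3):
    """One round of supervised OSI training: regress stacked (s, a) windows onto params."""
    optimiser = torch.optim.Adam(model.parameters(), lr=lr)
    data = torch.as_tensor(np.asarray(data), dtype=torch.float32)
    label = torch.as_tensor(np.asarray(label), dtype=torch.float32)
    loss = None
    for _ in range(epoch):
        pred = model(data)
        loss = nn.functional.mse_loss(pred, label)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
    return model, loss.item()

# Example with hypothetical shapes (osi_input_dim = 70, params_dim = 22):
#   model = OSISketch(70, 22)
#   model, loss = osi_update_sketch(model, np.random.randn(64, 70), np.random.randn(64, 22))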
# the replay buffer is a class; use a BaseManager to create a proxy for sharing it across processes
BaseManager.register('ReplayBuffer', ReplayBuffer)
manager = BaseManager()
manager.start()
replay_buffer = manager.ReplayBuffer(replay_buffer_size)  # share the replay buffer through the manager

env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(env_name)
prefix = env_name + str(len(environment_params["parameters_to_randomise"]))  # number of randomised parameters
model_path = '../../../../../data/uposi_td3/model/' + prefix + '_uposi_td3'

params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
params_dim = params.shape[0]  # dimension of parameters for prediction
print('Dimension of parameters for prediction: {}'.format(params_dim))

action_space = env.action_space
ini_state_space = env.observation_space
state_space = spaces.Box(-np.inf, np.inf,
                         shape=(ini_state_space.shape[0] + params_dim, ))  # add the dynamics-parameter dimension

if CAT_INTERNAL:
    internal_state_dim = env.get_internal_state_dimension()
    print('Dimension of internal state: ', internal_state_dim)
    _, _, _, info = env.step(np.zeros(action_space.shape[0]))
    internal_action_dim = np.array(info["joint_velocities"]).shape[0]
    print('Dimension of internal action: ', internal_action_dim)
    osi_input_dim = osi_l * (ini_state_space.shape[0] + action_space.shape[0] +
                             internal_state_dim + internal_action_dim)
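# --- Illustrative sketch (not part of the original scripts) -------------------------
# The managed ReplayBuffer above is handed to the sampling workers as a process
# argument. A typical launch pattern (assumed here, with a trivial stand-in worker so
# the snippet is self-contained and runnable on its own) uses torch.multiprocessing
# plus a Queue per logged quantity:
import torch.multiprocessing as mp


def _demo_worker(worker_id, shared_buffer, rewards_queue):
    # Stand-in for `worker(...)` above: push a dummy reward so the parent can read it.
    rewards_queue.put(float(worker_id))


if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)  # 'spawn' is required for CUDA in subprocesses
    demo_rewards_queue = mp.Queue()
    processes = [mp.Process(target=_demo_worker, args=(i, None, demo_rewards_queue))
                 for i in range(2)]
    for p in processes:
        p.start()
    print([demo_rewards_queue.get() for _ in processes])
    for p in processes:
        p.join()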
def evaluate(env, policy, up=False, eval_episodes=5, Projection=False, proj_net=None, num_envs=5):
    '''
    Evaluate the policy during training time with fixed dynamics.
    :param eval_episodes: number of evaluation episodes for each environment
    :param up (bool): universal policy conditioned on dynamics parameters
    '''
    number_of_episodes = eval_episodes
    return_per_test_environment = []
    success_per_test_environment = []

    # initial_dynamics_params = env.get_dynamics_parameters()
    dynamics_params = testing_envionments_generator_random(env, num_envs)  # before randomisation off
    ### SUGGESTED CHANGE ###
    initial_dynamics_params, _ = env.randomisation_off()
    ########################

    # print('Starting testing')
    for params in dynamics_params:
        mean_return = 0.0
        success = 0
        # print('with parameters {} ...'.format(params))
        for i in range(number_of_episodes):
            ### SUGGESTED CHANGE ###
            # First set the parameters and then reset for them to take effect.
            env.set_dynamics_parameters(params)
            obs = env.reset()
            p = env.get_dynamics_parameters()
            ########################
            # obs = env.reset()
            # env.set_dynamics_parameters(params)  # set parameters after reset

            if up:
                params_list = query_params(env, randomised_only=True)
                if Projection and proj_net is not None:
                    params_list = proj_net.get_context(params_list)
                obs = np.concatenate((params_list, obs))

            episode_return = 0.0
            done = False
            while not done:
                action = policy.get_action(obs)
                obs, reward, done, info = env.step(action)
                episode_return += reward
                if up:
                    obs = np.concatenate((params_list, obs))
                if done:
                    mean_return += episode_return
                    success += int(info['success'])
                    break

        mean_return /= number_of_episodes
        success_rate = float(success) / float(number_of_episodes)
        # print('the mean return is {}'.format(mean_return))
        return_per_test_environment.append(mean_return)
        success_per_test_environment.append(success_rate)

    ### SUGGESTED CHANGE ###
    # Only need to call randomisation_on; this restores the previous parameters and the
    # randomisation too. Note that this only takes effect at the next reset, where the
    # parameters are randomised again.
    env.randomisation_on()
    ########################
    # env.set_dynamics_parameters(initial_dynamics_params)  # for non-randomised cases, restore the original dynamics parameters

    print('total_average_return_over_test_environments: {}'.format(np.mean(return_per_test_environment)))
    return np.mean(return_per_test_environment), np.mean(success_per_test_environment)
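# --- Illustrative usage (not part of the original scripts) --------------------------
# During training, `evaluate` is called periodically on the same (randomised)
# environment; for a universal policy the dynamics parameters are concatenated to the
# observation, hence `up=True`. A hypothetical call with this signature:
#
#     mean_return, success_rate = evaluate(env, td3_trainer.policy_net, up=True,
#                                          eval_episodes=5, num_envs=5)
#
# `testing_envionments_generator_random` samples `num_envs` parameter sets before
# randomisation is switched off, so each test environment keeps fixed dynamics for
# all of its evaluation episodes.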
def main():
    render = True

    # Parameters
    horizon = 50
    total_repeats = 50
    state_dim = 6
    action_dim = 3
    env_name = 'SawyerReach'

    ##### Prepare environment #####
    env = make(
        'SawyerReach',
        gripper_type="PushingGripper",
        parameters_to_randomise=[],
        randomise_initial_conditions=False,
        table_full_size=(0.8, 1.6, 0.719),
        use_camera_obs=False,
        use_object_obs=True,
        reward_shaping=True,
        use_indicator_object=False,
        has_renderer=render,
        has_offscreen_renderer=False,
        render_collision_mesh=False,
        render_visual_mesh=True,
        control_freq=10,
        horizon=100,
        ignore_done=False,
        camera_name="frontview",
        camera_height=256,
        camera_width=256,
        camera_depth=False,
        pid=True,
        success_radius=0.01,
    )
    env = FlattenWrapper(GymWrapper(EEFXVelocityControl(env, dof=3, max_action=0.1)),
                         keys='task-state', add_info=True)
    env._name = env_name
    env.reset()

    for policy_idx in range(1):
        ##### Setting up the policy #####
        method = ['Single', 'LSTM', 'EPI', 'UPOSI'][policy_idx]
        if method == 'Single':
            alg_idx = 1
        elif method == 'LSTM':
            alg_idx = 2
        elif method == 'UPOSI':
            alg_idx = 3
            osi_l = 5
            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = True
            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[0]  # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l * (state_dim + action_dim + internal_state_dim + internal_action_dim)
            state_dim += params_dim
        elif method == 'EPI':
            alg_idx = 4
            embed_dim = 10
            traj_l = 10
            NO_RESET = True
            embed_input_dim = traj_l * (state_dim + action_dim)
            ori_state_dim = state_dim
            state_dim += embed_dim
        else:
            continue

        alg_name = ['sac', 'td3', 'lstm_td3', 'uposi_td3', 'epi_td3'][alg_idx]
        if method == 'EPI' or method == 'UPOSI':
            randomisation_idx_range = 1
        else:
            randomisation_idx_range = 4

        for randomisation_idx in range(1, 2):  # randomisation_idx_range
            for goal_idx, goal_pos in enumerate(REACH_GOALS):
                goal_idx = goal_idx + 1

                ##### Setting up the environment #####
                randomisation_params = [reach_full_randomisation,
                                        [],
                                        reach_force_randomisation,
                                        reach_force_noise_randomisation]
                env.change_parameters_to_randomise(randomisation_params[randomisation_idx])
                env._set_goal_neutral_offset(*goal_pos)
                env.reset()
                if render:
                    env.viewer.set_camera(camera_id=0)
                    env.render()

                base_pos_in_world = env.sim.data.get_body_xpos("base")
                base_rot_in_world = env.sim.data.get_body_xmat("base").reshape((3, 3))
                base_pose_in_world = T.make_pose(base_pos_in_world, base_rot_in_world)
                world_pose_in_base = T.pose_inv(base_pose_in_world)

                # choose which randomisation is applied
                randomisation_type = ['reach_full-randomisation', 'reach_no-randomisation',
                                      'reach_force-randomisation', 'reach_force-&-noise-randomisation'][randomisation_idx]
                number_random_params = [14, 0, 1, 3][randomisation_idx]
                folder_path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/' + method + '/' + alg_name + '/model/'
                path = folder_path + env_name + str(number_random_params) + '_' + alg_name
                try:
                    policy = load(path=path, alg=alg_name, state_dim=state_dim, action_dim=action_dim)
                    if method == 'UPOSI':
                        osi_model = load_model(model_name='osi', path=path,
                                               input_dim=osi_input_dim, output_dim=params_dim)
                    elif method == 'EPI':
                        embed_model = load_model(model_name='embedding', path=path,
                                                 input_dim=embed_input_dim, output_dim=embed_dim)
                        embed_model.cuda()
                        epi_policy_path = folder_path + env_name + str(number_random_params) + '_' + 'epi_ppo_epi_policy'
                        epi_policy = load(path=epi_policy_path, alg='ppo',
                                          state_dim=ori_state_dim, action_dim=action_dim)
                except Exception:
                    print(method, ',', randomisation_type)
                    continue

                if alg_name == 'lstm_td3':
                    # initialise the LSTM hidden state: (hidden, cell), each of shape (layer, batch, dim)
                    hidden_out = (torch.zeros([1, 1, 512], dtype=torch.float).cuda(),
                                  torch.zeros([1, 1, 512], dtype=torch.float).cuda())
                    last_action = np.array([0, 0, 0])
                ######################################

                log_save_name = '{}_{}_{}_{}'.format(method, alg_name, randomisation_type, number_random_params)

                for repeat in range(total_repeats):
                    # Reset environment
                    obs = env.reset()
                    if render:
                        env.viewer.set_camera(camera_id=0)
                        env.render()

                    # Establish extra frame transforms
                    base_rot_in_eef = env.init_right_hand_orn.T

                    # Set up logger
                    log_path = '../../../../data/reaching/sim/new_runs/{}/goal_{}/trajectory_log_{}.csv'.format(
                        log_save_name, goal_idx, repeat)
                    log_list = ["step", "time",
                                "cmd_eef_vx", "cmd_eef_vy", "cmd_eef_vz",
                                "eef_x", "eef_y", "eef_z",
                                "eef_vx", "eef_vy", "eef_vz",
                                "goal_x", "goal_y", "goal_z",
                                "obs_0", "obs_1", "obs_2",
                                "obs_3", "obs_4", "obs_5",
                                ]
                    logger = Logger(log_list, log_path, verbatim=render)

                    i = 0
                    mujoco_start_time = env.sim.data.time

                    if alg_name == 'uposi_td3':
                        uposi_traj = []
                        # params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()
                        params_state = np.concatenate((pre_params, obs))
                    elif alg_name == 'epi_td3':
                        if NO_RESET:
                            i = traj_l - 1
                            # only one trajectory; pass in params to ensure they are not changed
                            traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                             mujoco_start_time=mujoco_start_time,
                                                                             logger=logger,
                                                                             data_grabber=grab_data,
                                                                             max_steps=traj_l,
                                                                             params=None)
                            state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                            embedding = embed_model(state_action_in_traj.reshape(-1))
                            embedding = embedding.detach().cpu().numpy()
                            obs = last_obs             # last observation
                            env.set_state(last_state)  # last underlying state
                        else:
                            # only one trajectory; pass in params to ensure they are not changed
                            traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                             mujoco_start_time=mujoco_start_time,
                                                                             logger=None,
                                                                             data_grabber=None,
                                                                             max_steps=traj_l,
                                                                             params=None)
                            state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                            embedding = embed_model(state_action_in_traj.reshape(-1))
                            embedding = embedding.detach().cpu().numpy()

                            params = env.get_dynamics_parameters()
                            env.randomisation_off()
                            env.set_dynamics_parameters(params)  # same as the rollout env
                            obs = env.reset()  # reset to make the params take effect
                            env.randomisation_on()
                        obs = np.concatenate((obs, embedding))

                    while True:
                        mujoco_elapsed = env.sim.data.time - mujoco_start_time

                        #### Choosing the action ####
                        if alg_name == 'lstm_td3':
                            hidden_in = hidden_out
                            action, hidden_out = policy.get_action(obs, last_action, hidden_in, noise_scale=0.0)
                            last_action = action
                        elif alg_name == 'uposi_td3':
                            # using internal state or not
                            if CAT_INTERNAL:
                                internal_state = env.get_internal_state()
                                full_state = np.concatenate([obs, internal_state])
                            else:
                                full_state = obs
                            action = policy.get_action(params_state, noise_scale=0.0)
                        else:
                            action = policy.get_action(obs, noise_scale=0.0)
                        #############################

                        next_obs, reward, done, info = env.step(action)

                        if alg_name == 'uposi_td3':
                            if CAT_INTERNAL:
                                target_joint_action = info["joint_velocities"]
                                full_action = np.concatenate([action, target_joint_action])
                            else:
                                full_action = action
                            uposi_traj.append(np.concatenate((full_state, full_action)))

                            if len(uposi_traj) >= osi_l:
                                osi_input = stack_data(uposi_traj, osi_l)
                                pre_params = osi_model(osi_input).detach().numpy()
                            else:
                                zero_osi_input = np.zeros(osi_input_dim)
                                pre_params = osi_model(zero_osi_input).detach().numpy()

                            next_params_state = np.concatenate((pre_params, next_obs))
                            params_state = next_params_state
                        elif alg_name == 'epi_td3':
                            next_obs = np.concatenate((next_obs, embedding))

                        # Grab logging data
                        eef_pos_in_base, eef_vel_in_base, goal_pos_in_base = grab_data(info, world_pose_in_base)
                        action_in_base = base_rot_in_eef.dot(action)
                        logger.log(i, mujoco_elapsed,
                                   action_in_base[0], action_in_base[1], action_in_base[2],
                                   eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                   eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                   goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                   obs[0], obs[1], obs[2],
                                   obs[3], obs[4], obs[5],
                                   )

                        obs = next_obs
                        if render:
                            env.render()
                        i += 1
                        if i >= horizon:
                            break

    env.close()
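# --- Illustrative sketch (not part of the original scripts) -------------------------
# `grab_data(info, world_pose_in_base)` is provided by the repository. The logging
# above only needs points and velocities re-expressed in the robot base frame, i.e.
# an application of the 4x4 homogeneous transform built earlier from T.make_pose and
# T.pose_inv. A minimal numpy version of that re-expression (function names are
# assumptions) looks like this:
import numpy as np


def point_in_base_sketch(point_in_world, world_pose_in_base):
    """Re-express a 3-D point given in world coordinates in the base frame."""
    hom = np.append(point_in_world, 1.0)          # homogeneous coordinates
    return (world_pose_in_base @ hom)[:3]


def velocity_in_base_sketch(vel_in_world, world_pose_in_base):
    """Velocities only need the rotation part of the transform."""
    return world_pose_in_base[:3, :3] @ vel_in_world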
def main():
    render = False

    # Parameters
    total_repeats = 50

    for env_name in ['SawyerSlide', 'SawyerReach', 'SawyerPush']:  # , 'SawyerReach', 'SawyerSlide'
        for use_white_noise in [False]:  # True
            if not use_white_noise:
                parameter_predictions = []
                oracle_parameters = []

            if env_name == 'SawyerReach':
                state_dim = 6
                action_dim = 3
                horizon = 50
                task = 'reaching'
            elif env_name == 'SawyerPush':
                horizon = 80
                state_dim = 10
                action_dim = 2
                task = 'pushing'
            elif env_name == 'SawyerSlide':
                horizon = 60
                state_dim = 12
                action_dim = 2
                task = 'sliding'

            env = make_env(task, render)
            env.reset()

            osi_l = 5
            method = 'UPOSI'
            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = True
            if task == 'sliding':
                CAT_INTERNAL = False

            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[0]  # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l * (state_dim + action_dim + internal_state_dim + internal_action_dim)
            else:
                osi_input_dim = osi_l * (state_dim + action_dim)
            state_dim += params_dim

            alg_name = 'uposi_td3'

            if task == 'reaching':
                GOALS = REACH_GOALS
            elif task == 'pushing':
                GOALS = PUSH_GOALS
            elif task == 'sliding':
                GOALS = SLIDE_GOALS

            for goal_idx, goal_pos in enumerate(GOALS):
                goal_idx = goal_idx + 1

                ##### Setting up the environment #####
                if task == 'reaching':
                    env._set_goal_neutral_offset(*goal_pos)
                elif task == 'pushing':
                    start_pos = np.array([0., -16.75e-2])
                    env._set_gripper_neutral_offset(*start_pos)
                    env._set_goal_neutral_offset(*goal_pos)
                env.reset()
                if render:
                    env.viewer.set_camera(camera_id=0)
                    env.render()

                base_pos_in_world = env.sim.data.get_body_xpos("base")
                base_rot_in_world = env.sim.data.get_body_xmat("base").reshape((3, 3))
                base_pose_in_world = T.make_pose(base_pos_in_world, base_rot_in_world)
                world_pose_in_base = T.pose_inv(base_pose_in_world)

                # choose which randomisation is applied
                randomisation_type = 'reach_full-randomisation'
                if task == 'reaching':
                    number_random_params = 14
                elif task == 'pushing':
                    number_random_params = 23
                elif task == 'sliding':
                    number_random_params = 22

                path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/' + method + '/' + alg_name + '/model/' \
                       + env_name + str(number_random_params) + '_' + alg_name
                try:
                    policy = load(path=path, alg=alg_name, state_dim=state_dim, action_dim=action_dim)
                    if not use_white_noise:
                        osi_model = load_model(model_name='osi', path=path,
                                               input_dim=osi_input_dim, output_dim=params_dim)
                except Exception:
                    print(method, ',', randomisation_type)
                    continue

                log_save_name = '{}_{}_{}_{}'.format(method, alg_name, randomisation_type, number_random_params)

                for repeat in range(total_repeats):
                    # Reset environment
                    obs = env.reset()

                    # Establish extra frame transforms
                    if task != 'sliding':
                        base_rot_in_eef = env.init_right_hand_orn.T

                    # Set up logger
                    if use_white_noise:
                        noise_folder = 'noise'
                    else:
                        noise_folder = 'normal'
                    log_path = '../../../../data/uposi_ablation/{}/{}/{}/goal_{}/trajectory_log_{}.csv'.format(
                        task, noise_folder, log_save_name, goal_idx, repeat)
                    if task == 'reaching':
                        log_list = ["step", "time",
                                    "cmd_eef_vx", "cmd_eef_vy", "cmd_eef_vz",
                                    "eef_x", "eef_y", "eef_z",
                                    "eef_vx", "eef_vy", "eef_vz",
                                    "goal_x", "goal_y", "goal_z",
                                    "obs_0", "obs_1", "obs_2",
                                    "obs_3", "obs_4", "obs_5",
                                    ]
                    elif task == 'pushing':
                        log_list = ["step", "time",
                                    "cmd_eef_vx", "cmd_eef_vy",
                                    "goal_x", "goal_y", "goal_z",
                                    "eef_x", "eef_y", "eef_z",
                                    "eef_vx", "eef_vy", "eef_vz",
                                    "object_x", "object_y", "object_z",
                                    "object_vx", "object_vy", "object_vz",
                                    "z_angle",
                                    "obs_0", "obs_1", "obs_2", "obs_3", "obs_4",
                                    "obs_5", "obs_6", "obs_7", "obs_8", "obs_9",
                                    ]
                    elif task == 'sliding':
                        log_list = ["step", "time",
                                    "cmd_j5", "cmd_j6",
                                    "obj_x", "obj_y", "obj_z",
                                    "sin_z", "cos_z",
                                    "obj_vx", "obj_vy", "obj_vz",
                                    "a_j5", "a_j6",
                                    "v_j5", "v_j6",
                                    ]
                    logger = Logger(log_list, log_path, verbatim=render)

                    i = 0
                    mujoco_start_time = env.sim.data.time

                    if use_white_noise:
                        params = np.random.uniform(-1., 1., size=(params_dim,))
                        params_state = np.concatenate((params, obs))
                    else:
                        epi_traj = []
                        params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()
                        oracle_parameters.append(params)
                        parameter_predictions.append(pre_params)
                        params_state = np.concatenate((pre_params, obs))

                    while True:
                        mujoco_elapsed = env.sim.data.time - mujoco_start_time

                        #### Choosing the action ####
                        if CAT_INTERNAL:
                            internal_state = env.get_internal_state()
                            full_state = np.concatenate([obs, internal_state])
                        else:
                            full_state = obs
                        action = policy.get_action(params_state, noise_scale=0.0)
                        #############################

                        try:
                            next_obs, reward, done, info = env.step(action)
                        except MujocoException:
                            print('MujocoException')

                        if use_white_noise:
                            params = np.random.uniform(-1., 1., size=(params_dim,))
                            next_params_state = np.concatenate((params, next_obs))
                            params_state = next_params_state
                        else:
                            if CAT_INTERNAL:
                                target_joint_action = info["joint_velocities"]
                                full_action = np.concatenate([action, target_joint_action])
                            else:
                                full_action = action
                            epi_traj.append(np.concatenate((full_state, full_action)))

                            if len(epi_traj) >= osi_l:
                                osi_input = stack_data(epi_traj, osi_l)
                                pre_params = osi_model(osi_input).detach().numpy()
                            else:
                                zero_osi_input = np.zeros(osi_input_dim)
                                pre_params = osi_model(zero_osi_input).detach().numpy()

                            oracle_parameters.append(params)
                            parameter_predictions.append(pre_params)
                            next_params_state = np.concatenate((pre_params, next_obs))
                            params_state = next_params_state

                        if task == 'reaching':
                            # Grab logging data
                            eef_pos_in_base, eef_vel_in_base, goal_pos_in_base = grab_reach_data(info, world_pose_in_base)
                            action_in_base = base_rot_in_eef.dot(action)
                            logger.log(i, mujoco_elapsed,
                                       action_in_base[0], action_in_base[1], action_in_base[2],
                                       eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                       eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                       goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                       obs[0], obs[1], obs[2],
                                       obs[3], obs[4], obs[5],
                                       )
                        elif task == 'pushing':
                            goal_pos_in_base, eef_pos_in_base, eef_vel_in_base, \
                                object_pos_in_base, object_vel_in_base, z_angle = grab_push_data(info, world_pose_in_base)
                            action_3d = np.concatenate([action, [0.0]])
                            action_3d_in_base = base_rot_in_eef.dot(action_3d)
                            logger.log(i, mujoco_elapsed,
                                       action_3d_in_base[0], action_3d_in_base[1],
                                       goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                       eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                       eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                       object_pos_in_base[0], object_pos_in_base[1], object_pos_in_base[2],
                                       object_vel_in_base[0], object_vel_in_base[1], object_vel_in_base[2],
                                       z_angle[0],
                                       obs[0], obs[1], obs[2], obs[3], obs[4],
                                       obs[5], obs[6], obs[7], obs[8], obs[9],
                                       )
                        elif task == 'sliding':
                            logger.log(i, mujoco_elapsed,
                                       action[0], action[1],
                                       obs[0], obs[1], obs[2],
                                       obs[3], obs[4],
                                       obs[5], obs[6], obs[7],
                                       obs[8], obs[9],
                                       obs[10], obs[11],
                                       )

                        obs = next_obs
                        if render:
                            env.render()
                        i += 1
                        if i >= horizon:
                            break

            if not use_white_noise:
                parameter_predictions = np.array(parameter_predictions)
                oracle_parameters = np.array(oracle_parameters)
                # parameters are normalised to [-1, 1] (range 2), so report error as a percentage of that range
                percent_diff = np.abs((parameter_predictions - oracle_parameters) / 2.) * 100
                average_percent_diff = np.nanmean(percent_diff[np.isfinite(percent_diff)])
                print('{} percent diff: {}'.format(task, average_percent_diff))

                save_path = '../../../../data/uposi_ablation/{}/{}/'.format(task, noise_folder)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                    file = open(os.path.join(save_path, 'percent_diff.txt'), 'w')
                else:
                    file = open(os.path.join(save_path, 'percent_diff.txt'), 'a')
                file.write('{} percent diff: {}'.format(task, average_percent_diff))
                file.close()

            env.close()
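# --- Illustrative sketch (not part of the original scripts) -------------------------
# The ablation above reports the OSI prediction error as a percentage of the
# normalised parameter range: parameters are assumed scaled to [-1, 1] (range 2), so
# |prediction - oracle| / 2 * 100 is the error in percent of the range. A tiny
# self-contained check of that metric with made-up numbers:
import numpy as np

pred = np.array([[0.10, -0.50], [0.00, 1.00]])
true = np.array([[0.00, -0.50], [0.20, 0.80]])
percent_diff = np.abs((pred - true) / 2.0) * 100.0
print(np.nanmean(percent_diff[np.isfinite(percent_diff)]))  # -> 6.25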