def create_init_conditions(self):
    """Create the initial conditions for the environment.

    Returns:
        List of initial conditions registered in the environment.
    """
    change_print_color.change('MAGENTA')
    print("\nCreating Initial Conditions...")

    initial_cond = self.task_params['init_cond']
    ddof = 3  # Data dof (file): x, y, theta
    pdof = 3  # Pose dof (env): x, y, theta
    ntgt = self.task_params['ntargets']

    for cond in initial_cond:
        condition = np.zeros(self.env.obs_dim)
        condition[:self.env.action_dim] = np.deg2rad(cond[:3])
        cond_idx = 2*self.env.action_dim + pdof  # EE pose will be obtained from sim
        data_idx = self.env.action_dim
        for tt in range(ntgt):
            tgt_data = cond[data_idx:data_idx+ddof]
            # tgt_pose = create_quat_pose(pos_x=tgt_data[0],
            #                             pos_y=tgt_data[1],
            #                             pos_z=z_fix,
            #                             rot_yaw=np.deg2rad(tgt_data[2]))
            # condition[cond_idx:cond_idx+pdof] = tgt_pose
            tgt_data[2] = np.deg2rad(tgt_data[2])
            condition[cond_idx:cond_idx+pdof] = tgt_data
            cond_idx += pdof
            data_idx += ddof
        self.env.add_init_cond(condition)

    return self.env.get_conditions()
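# ----------------------------------------------------------------------------
# Illustrative sketch (standalone helper, not called by the pipeline): how one
# row of task_params['init_cond'] is unpacked into the flat condition vector
# above. The dimensions here (action_dim=3, two targets) and the example row
# are hypothetical; the real values come from the environment and config.
# ----------------------------------------------------------------------------
def _demo_init_condition_layout():
    import numpy as np
    action_dim, pdof, ddof, ntgt = 3, 3, 3, 2
    obs_dim = 2*action_dim + pdof + ntgt*pdof  # qpos + qvel + EE pose + targets

    # Hypothetical row: [q0, q1, q2, tgt0_x, tgt0_y, tgt0_yaw_deg, tgt1_x, ...]
    cond = [10., -20., 30., 0.5, 0.2, 45., -0.4, 0.3, -90.]

    condition = np.zeros(obs_dim)
    condition[:action_dim] = np.deg2rad(cond[:action_dim])  # joint angles (rad)
    cond_idx, data_idx = 2*action_dim + pdof, action_dim    # EE pose from sim
    for _ in range(ntgt):
        tgt = np.array(cond[data_idx:data_idx+ddof])
        tgt[2] = np.deg2rad(tgt[2])  # target yaw: degrees -> radians
        condition[cond_idx:cond_idx+pdof] = tgt
        cond_idx += pdof
        data_idx += ddof
    return condition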
def create_environment(self):
    """Instantiate a specific RL environment to interact with.

    Returns:
        RL environment
    """
    change_print_color.change('BLUE')
    print("\nCreating Environment...")

    # Environment parameters
    env_with_img = False
    rdn_tgt_pos = False
    render = self.task_params['render']
    obs_like_mjc = self.task_params['obs_like_mjc']
    ntargets = self.task_params['ntargets']
    tgt_weights = self.task_params['tgt_weights']
    tgt_positions = self.task_params['tgt_positions']
    tgt_types = self.task_params['tgt_types']

    env = Pusher3DofBulletEnv(render=render, obs_with_img=env_with_img,
                              obs_mjc_gym=obs_like_mjc, ntargets=ntargets,
                              rdn_tgt_pos=rdn_tgt_pos, tgt_types=tgt_types)
    env.set_tgt_cost_weights(tgt_weights)
    env.set_tgt_pos(tgt_positions)

    print("Environment: %s OK!" % type(env).__name__)
    return env
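# ----------------------------------------------------------------------------
# Hedged usage sketch of the constructor above (standalone helper, not called
# by the pipeline). The target types, weights and positions are hypothetical
# placeholders, not the task defaults from task_params.
# ----------------------------------------------------------------------------
def _demo_pusher_env():
    env = Pusher3DofBulletEnv(render=False, obs_with_img=False,
                              obs_mjc_gym=True, ntargets=2, rdn_tgt_pos=False,
                              tgt_types=['C', 'C'])    # hypothetical types
    env.set_tgt_cost_weights([1.0, 0.5])               # hypothetical weights
    env.set_tgt_pos([(0.5, 0.2), (-0.4, 0.3)])         # hypothetical positions
    return env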
def test_policy(self, pol_type=None, condition=0, iteration=-1):
    noise = np.zeros((self.task_params['T'], self.agent.act_dim))

    if iteration == -1:
        for rr in range(600):
            temp_path = self.hyperparams['log_dir'] + ('/itr_%02d' % rr)
            if os.path.exists(temp_path):
                iteration += 1
        if iteration == -1:
            print("There is no itr_XX data in '%s'" % self.hyperparams['log_dir'])
            return False

    dir_path = 'itr_%02d/' % iteration
    itr_data_file = dir_path + 'iteration_data_itr_%02d.pkl' % iteration
    change_print_color.change('BLUE')
    print("\nLoading iteration data '%s'..." % itr_data_file)
    itr_data = self.learn_algo.data_logger.unpickle(itr_data_file)
    policy = itr_data[condition].traj_distr

    self.env.reset(condition=condition)
    input('Press a key to start sampling...')
    sample = self.agent.sample(self.env, condition, self.task_params['T'],
                               self.task_params['Ts'], noise, policy=policy,
                               save=False)
    return True
def create_agent(self):
    """Instantiate the RL agent that interacts with the environment.

    Returns:
        RL agent
    """
    change_print_color.change('CYAN')
    print("\nCreating Agent...")
    agent = NoPolAgent(act_dim=self.action_dim, obs_dim=self.obs_dim,
                       state_dim=self.state_dim,
                       agent_name='agent' + ('%02d' % self.hyperparams['run_num']))
    print("Agent: %s OK\n" % type(agent).__name__)
    return agent
def create_agent(self):
    """Instantiate the RL agent that interacts with the environment.

    Returns:
        RL agent
    """
    change_print_color.change('CYAN')
    print("\nCreating Agent...")

    policy_params = {
        'network_model': tf_network,  # tf_network, multi_modal_network, multi_modal_network_fp
        'network_params': {
            'n_layers': 2,  # Number of hidden layers
            'dim_hidden': [32, 32],  # List of sizes, one per hidden layer
            'obs_names': self.env.get_obs_info()['names'],
            'obs_dof': self.env.get_obs_info()['dimensions'],  # DoF for observation data tensor
        },
        # Initialization.
        'init_var': 0.1,  # Initial policy variance.
        'ent_reg': 0.0,  # Entropy regularizer (used to update policy variance)
        # Solver hyperparameters.
        'iterations': self.task_params['tf_iterations'],  # Number of iterations per inner iteration (default: 5000).
        'batch_size': 15,
        'lr': 0.001,  # Base learning rate (by default it's fixed).
        'lr_policy': 'fixed',  # Learning rate policy.
        'momentum': 0.9,  # Momentum.
        'weight_decay': 0.005,  # Weight decay to prevent overfitting.
        'solver_type': 'Adam',  # Solver type (e.g. 'SGD', 'Adam', 'RMSPROP', 'MOMENTUM', 'ADAGRAD').
        # GPU usage.
        'use_gpu': self.task_params['use_gpu'],  # Whether or not to use the GPU for training.
        'gpu_id': 0,
        'gpu_mem_percentage': self.task_params['gpu_mem_percentage'],
        # Training data.
        'fc_only_iterations': 0,  # Iterations of FC-only training before normal training
        # Others.
        'random_seed': (self.hyperparams['seed']
                        if self.task_params['tf_seed'] == -1
                        else self.task_params['tf_seed']),  # TF random seed
        'log_dir': self.hyperparams['log_dir'],
        # 'weights_file_prefix': EXP_DIR + 'policy',
    }

    policy_opt = {
        'type': PolicyOptTf,
        'hyperparams': policy_params,
    }

    agent = GPSAgent(act_dim=self.action_dim, obs_dim=self.obs_dim,
                     state_dim=self.state_dim, policy_opt=policy_opt,
                     agent_name='agent' + ('%02d' % self.hyperparams['run_num']))
    print("Agent: %s OK\n" % type(agent).__name__)
    return agent
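# ----------------------------------------------------------------------------
# Illustrative sketch (standalone helper, not called by the pipeline): the
# 'network_params' above describe a plain MLP with n_layers hidden layers of
# dim_hidden units each. This is a numpy version of that forward pass; the
# real network is built in TensorFlow by tf_network, and the obs/act sizes
# below are hypothetical.
# ----------------------------------------------------------------------------
def _demo_policy_mlp(obs_dim=12, act_dim=3, dim_hidden=(32, 32), seed=0):
    import numpy as np
    rng = np.random.default_rng(seed)
    sizes = [obs_dim] + list(dim_hidden) + [act_dim]
    Ws = [rng.normal(scale=0.1, size=(n_in, n_out))
          for n_in, n_out in zip(sizes[:-1], sizes[1:])]
    bs = [np.zeros(n_out) for n_out in sizes[1:]]

    obs = np.zeros(obs_dim)
    h = obs
    for W, b in zip(Ws[:-1], bs[:-1]):
        h = np.maximum(0.0, h @ W + b)   # ReLU hidden layers
    return h @ Ws[-1] + bs[-1]           # linear action output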
def train(self, itr_load=None):
    """Train the RL agent with the learning algorithm.

    Args:
        itr_load: Iteration number with which to start

    Returns:
        bool: True for success, False otherwise.
    """
    change_print_color.change('WHITE')
    return self.learn_algo.run(itr_load)
def test_policy(self, pol_type=None, condition=0, iteration=-1):
    """Test the RL agent using the policy learned at the specified
    iteration for the given condition.

    Args:
        pol_type: 'global' or 'local'
        condition: Condition number to test the agent
        iteration: Iteration to test the agent

    Returns:
        bool: True for success, False otherwise.
    """
    noise = np.zeros((self.task_params['T'], self.agent.act_dim))

    if iteration == -1:
        for rr in range(600):
            temp_path = self.hyperparams['log_dir'] + ('/itr_%02d' % rr)
            if os.path.exists(temp_path):
                iteration += 1
        if iteration == -1:
            print("There is no itr_XX data in '%s'" % self.hyperparams['log_dir'])
            return False

    dir_path = 'itr_%02d/' % iteration
    itr_data_file = dir_path + 'iteration_data_itr_%02d.pkl' % iteration
    change_print_color.change('BLUE')
    print("\nLoading iteration data '%s'..." % itr_data_file)
    itr_data = self.learn_algo.data_logger.unpickle(itr_data_file)
    policy = itr_data[condition].traj_distr

    stop = False
    while not stop:
        self.env.reset(condition=condition)
        input('Press a key to start sampling...')
        sample = self.agent.sample(self.env, condition, self.task_params['T'],
                                   self.task_params['Ts'], noise,
                                   policy=policy, save=False)
        answer = input("Execute again? Type 'n' to stop: ")
        if answer.lower() == 'n':
            stop = True

    return True
def create_agent(self):
    """Instantiate the RL agent that interacts with the environment.

    Returns:
        RL agent
    """
    change_print_color.change('CYAN')
    print("\nCreating Agent...")
    agent = NoPolAgent(act_dim=self.action_dim, obs_dim=self.obs_dim,
                       state_dim=self.state_dim,
                       agent_name='agent' + ('%02d' % self.hyperparams['run_num']))
    print("Agent: %s OK\n" % type(agent).__name__)
    return agent
def create_environment(self):
    """Instantiate a specific RL environment to interact with.

    Returns:
        RL environment
    """
    change_print_color.change('BLUE')
    print("\nCreating Environment...")

    dt = self.task_params['Ts']

    # Create a CENTAURO ROS EnvInterface
    env = CentauroDrillEnv(dt=dt)

    print("Environment: %s OK!" % type(env).__name__)
    return env
RH_name = 'RWrMot3'
l_soft_hand_offset = np.array([0.000, -0.030, -0.210])
r_soft_hand_offset = np.array([0.000, 0.030, -0.210])

touching_drill_config = np.array([0., 0., 0., 0., 0., 0.,
                                  0., 0., 0., 0., 0., 0.,
                                  0., 0., 0.,
                                  0.0568, 0.2386, -0.2337, -1.6803, 0.2226, 0.0107, 0.5633,
                                  0., 0.,
                                  0.0568, -0.2386, 0.2337, -1.6803, -0.2226, 0.0107, -0.5633])

# ################### #
# ################### #
# ### ENVIRONMENT ### #
# ################### #
# ################### #
change_print_color.change('BLUE')
print("\nCreating Bigman environment...")

# Robot configuration
interface = 'ros'
body_part_active = 'RA'
body_part_sensed = 'RA'
command_type = 'effort'

if body_part_active == 'RA':
    hand_y = -drill_size[1] / 2 - 0.02
    hand_z = drill_size[2] / 2 + 0.02
    hand_name = RH_name
    hand_offset = r_soft_hand_offset
else:
    hand_y = drill_size[1] / 2 + 0.02
def create_learning_algo(self):
    change_print_color.change('YELLOW')
    print("\nConfiguring learning algorithm...\n")

    # Learning params
    resume_training_itr = None  # Resume from previous training iteration

    # Dynamics
    learned_dynamics = {
        'type': DynamicsLRPrior,
        'regularization': 1e-6,
        'prior': {
            'type': DynamicsPriorGMM,
            'max_clusters': 20,  # Maximum number of clusters to fit.
            'min_samples_per_cluster': 40,  # Minimum samples per cluster.
            'max_samples': 20,  # Max. number of trajectories to use for fitting the GMM at any given time.
            'strength': 1.0,  # Adjusts the strength of the prior.
        },
    }

    init_traj_distr = {
        'type': init_pd,
        'init_var': np.array([1.0, 1.0, 1.0])*5.0e-01,
        'pos_gains': 0.001,  # float or array
        'vel_gains_mult': 0.01,  # Velocity gains multiplier on pos_gains
        'init_action_offset': None,
        'dJoints': self.env.get_total_joints(),  # Total joints in state
        'state_to_pd': 'joints',  # Joints
        'dDistance': 6,
    }

    # Trajectory Optimization Method
    traj_opt_method = {
        'type': DualistTrajOpt,
        'good_const': self.task_params['consider_good'],  # Use good constraints
        'bad_const': self.task_params['consider_bad'],  # Use bad constraints
        'del0': 1e-4,  # Eta updates for non-SPD Q-function (non-SPD correction step).
        'del0_good': 1e-4,  # Omega updates for non-SPD Q-function (non-SPD correction step).
        'del0_bad': 1e-8,  # Nu updates for non-SPD Q-function (non-SPD correction step).
        # 'eta_error_threshold': 1e16,  # TODO: REMOVE, it is not used
        'min_eta': 1e-8,  # At min_eta, kl_div > kl_step
        'max_eta': 1e16,  # At max_eta, kl_div < kl_step
        'min_omega': 1e-8,  # At min_omega, kl_div > kl_step
        'max_omega': self.task_params['max_omega'],  # 1e16  # At max_omega, kl_div < kl_step
        'min_nu': 1e-8,  # At min_nu, kl_div > kl_step
        'max_nu': self.task_params['max_nu'],  # At max_nu, kl_div < kl_step
        'step_tol': 0.1,
        'bad_tol': 0.1,
        'good_tol': 0.1,
        'cons_per_step': False,  # Whether or not to enforce separate KL constraints at each time step.
                                 # TODO: If True, maybe it does work with MDGPS because it doesn't consider dual vars
        'use_prev_distr': False,  # Whether or not to measure expected KL under the previous traj distr.
        'update_in_bwd_pass': True,  # Whether or not to update the TVLG controller during the bwd pass.
        'adam_alpha': 0.5,
        'adam_max_iter': 500,
        'weight_bad': self.task_params['weight_bad'],
        'weight_good': self.task_params['weight_good'],
    }

    good_trajs = None
    bad_trajs = None
    dualtrajopt_hyperparams = {
        'inner_iterations': self.task_params['inner_iterations'],  # Times the trajectories are updated
        # G/B samples selection | Fitting
        'good_samples': good_trajs,  # Good sample demos
        'bad_samples': bad_trajs,  # Bad sample demos
        'n_good_samples': self.task_params['n_good_samples'],  # Number of good samples per trajectory
        'n_bad_samples': self.task_params['n_bad_samples'],  # Number of bad samples per trajectory
        'n_good_buffer': self.task_params['n_good_buffer'],  # Number of good samples in the buffer
        'n_bad_buffer': self.task_params['n_bad_buffer'],  # Number of bad samples in the buffer
        'good_traj_selection_type': self.task_params['good_traj_selection_type'],  # 'always', 'only_traj'
        'bad_traj_selection_type': self.task_params['bad_traj_selection_type'],  # 'always', 'only_traj'
        'duality_dynamics_type': 'duality',  # Samples to use to update the dynamics: 'duality', 'iteration'
        # Initial dual variables
        'init_eta': 0.1,  # 4.62
        'init_nu': 0.1,
        'init_omega': 0.1,
        # KL step (epsilon)
        'step_rule': 'laplace',  # Whether to use 'laplace' or 'mc' cost in step adjustment
        'kl_step': self.task_params['kl_step'],  # Kullback-Leibler step (base_step)
        'min_step_mult': 0.01,  # Min possible value of step multiplier (multiplies kl_step)
        'max_step_mult': 10.0,  # Max possible value of step multiplier (multiplies kl_step)
        # KL bad (xi)
        'kl_bad': self.task_params['kl_bad'],  # 4.2  # Xi KL base value | kl_div_b >= kl_bad
        'min_bad_mult': 0.01,  # Min possible value of step multiplier (multiplies base_kl_bad)
        'max_bad_mult': 10.0,  # Max possible value of step multiplier (multiplies base_kl_bad)
        # KL good (chi)
        'kl_good': self.task_params['kl_good'],  # 2.0  # Chi KL base value | kl_div_g <= kl_good
        'min_good_mult': 0.01,  # Min possible value of step multiplier (multiplies base_kl_good)
        'max_good_mult': 10.0,  # Max possible value of step multiplier (multiplies base_kl_good)
        # LinearPolicy 'projection'
        'init_pol_wt': 0.01,  # TODO: remove need for init_pol_wt in MDGPS (it should not work with MDGPS)
        'policy_sample_mode': 'add',  # Mode to update dynamics prior (not used in ConstantPolicyPrior)
        'policy_prior': {
            'type': ConstantPolicyPrior,
            'strength': 1e-4,
        },
        'min_bad_var': np.array([3.0, 3.0, 3.0])*1.0e-02,
        'min_good_var': np.array([3.0, 3.0, 3.0])*1.0e-02,
        # TEMP Hyperparams
        'min_bad_rel_diff': self.task_params['min_bad_rel_diff'],
        'max_bad_rel_diff': self.task_params['max_bad_rel_diff'],
        'mult_bad_rel_diff': self.task_params['mult_bad_rel_diff'],
        'good_fix_rel_multi': self.task_params['good_fix_rel_multi'],
    }

    gps_hyperparams = {
        'T': self.task_params['T'],  # Total points
        'dt': self.task_params['Ts'],
        'iterations': self.task_params['iterations'],  # GPS episodes --> K iterations
        # Samples
        'num_samples': self.task_params['num_samples'],  # Samples for exploration trajs --> N samples
        'noisy_samples': True,
        'seed': self.task_params['seed'],
        'smooth_noise': True,  # Apply Gaussian filter to the generated noise
        'smooth_noise_var': 5.0e+0,  # np.power(2*Ts, 2)  # Variance of the Gaussian filter; in the Kumar (2016) paper it is the std dev of 2*Ts
        'smooth_noise_renormalize': True,  # Renormalize smooth noise to have variance=1
        'noise_var_scale': 1.e-1*np.ones(self.action_dim),  # Scale for Gaussian noise: N(0, 1)*sqrt(noise_var_scale); only used if smooth_noise_renormalize
        # Cost
        'cost': self.cost,
        # Conditions
        'conditions': len(self.init_cond),  # Total number of initial conditions
        'train_conditions': self.task_params['train_cond'],  # Indexes of conditions used for training
        # TrajDist
        'init_traj_distr': init_traj_distr,
        'fit_dynamics': True,
        'dynamics': learned_dynamics,
        'initial_state_var': 1e-2,  # Max value for x0sigma in trajectories
        # TrajOpt
        'traj_opt': traj_opt_method,
        'max_ent_traj': 0.0,  # Weight of maximum entropy term in trajectory optimization  # TODO: CHECK THIS VALUE
        # Others
        'algo_hyperparams': dualtrajopt_hyperparams,
        'data_files_dir': self.hyperparams['log_dir'],
    }

    return DualTrajOpt(self.agent, self.env, **gps_hyperparams)
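# ----------------------------------------------------------------------------
# Illustrative sketch (standalone helper, not called by the pipeline): the
# min/max_*_mult entries above bound the multiplier that rescales each base KL
# value (kl_step, kl_bad, kl_good) between iterations, so the effective
# constraint is base * clip(mult, min, max). The numbers below are
# hypothetical.
# ----------------------------------------------------------------------------
def _demo_effective_kl(base_kl=0.5, mult=25.0, min_mult=0.01, max_mult=10.0):
    import numpy as np
    return base_kl * np.clip(mult, min_mult, max_mult)  # here: 0.5*10.0 = 5.0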
def create_cost(self):
    """Instantiate the cost that evaluates the RL agent performance.

    Returns:
        Cost Function
    """
    change_print_color.change('GRAY')
    print("\nCreating Costs...")

    # Action Cost
    weight = 1e0  # 1e-4
    target = None
    act_cost = {
        'type': CostAction,
        'wu': np.ones(self.action_dim) * weight,
        'target': target,  # Target action value
    }

    # # FK Cost
    # fk_l1_cost = {
    #     'type': CostFK,
    #     'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
    #     'target_pose': target_distance_hand,
    #     'tgt_data_type': 'state',  # 'state' or 'observation'
    #     'tgt_idx': bigman_env.get_state_info(name='distance_hand')['idx'],
    #     'op_point_name': hand_name,
    #     'op_point_offset': hand_offset,
    #     'joints_idx': bigman_env.get_state_info(name='link_position')['idx'],
    #     'joint_ids': bigman_params['joint_ids'][body_part_active],
    #     'robot_model': robot_model,
    #     'wp': np.array([3.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
    #     'evalnorm': evall1l2term,
    #     # 'evalnorm': evallogl2term,
    #     'l1': 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
    #     'l2': 0.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
    #     'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
    #     'wp_final_multiplier': 1,  # 10
    # }
    #
    # fk_l2_cost = {
    #     'type': CostFK,
    #     'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
    #     'target_pose': target_distance_hand,
    #     'tgt_data_type': 'state',  # 'state' or 'observation'
    #     'tgt_idx': bigman_env.get_state_info(name='distance_hand')['idx'],
    #     'op_point_name': hand_name,
    #     'op_point_offset': hand_offset,
    #     'joints_idx': bigman_env.get_state_info(name='link_position')['idx'],
    #     'joint_ids': bigman_params['joint_ids'][body_part_active],
    #     'robot_model': robot_model,
    #     # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # One dim less because of 'quat' error | 1) orient, 2) pos
    #     # 'wp': np.array([1.0, 1.0, 1.0, 3.0, 3.0, 3.0]),
    #     'wp': np.array([3.0, 3.0, 3.0, 3.0, 3.0, 3.0]),
    #     'evalnorm': evall1l2term,
    #     # 'evalnorm': evallogl2term,
    #     'l1': 0.0,  # Weight for l1 norm
    #     'l2': 1.0,  # Weight for l2 norm
    #     'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
    #     'wp_final_multiplier': 1,  # 10
    # }

    # State costs
    target_distance_object = np.zeros(2)
    # input(self.env.get_state_info())
    # input(self.env.get_state_info(name='tgt0')['idx'])
    state_cost_distance = {
        'type': CostState,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.0,  # Weight for l1 norm
        'l2': 1.0,  # Weight for l2 norm
        'alpha': 1e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'tgt0': {
                'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                'target_state': target_distance_object,  # Target state - must be set.
                'average': None,
                'data_idx': self.env.get_state_info(name='tgt0')['idx'],
            },
        },
    }

    state_final_cost_distance = {
        'type': CostState,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.0,  # Weight for l1 norm
        'l2': 1.0,  # Weight for l2 norm
        'alpha': 1e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'tgt0': {
                'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                'target_state': target_distance_object,  # Target state - must be set.
                'average': None,
                'data_idx': self.env.get_state_info(name='tgt0')['idx'],
            },
        },
    }

    cost_safe_distance = {
        'type': CostSafeDistance,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time.
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'tgt1': {
                'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                'safe_distance': np.array([0.15, 0.15]),
                'outside_cost': np.array([0.0, 0.0]),
                'inside_cost': np.array([1.0, 1.0]),
                'data_idx': self.env.get_state_info(name='tgt1')['idx'],
            },
        },
    }

    cost_state_difference = {
        'type': CostStateDifference,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.e-0,  # Weight for l1 norm
        'l2': 5.e-2,  # Weight for l2 norm
        'alpha': 1e-10,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'ee': {
                'wp': np.array([1.0, 1.0, 0.6]),  # State weights - must be set.
                'target_state': 'tgt0',  # Target state - must be set.
                'average': None,
                'tgt_idx': self.env.get_state_info(name='tgt0')['idx'],
                'data_idx': self.env.get_state_info(name='ee')['idx'],
                'idx_to_use': [0, 1, 2],  # All: X, Y, theta
            },
        },
    }

    cost_final_state_difference = {
        'type': CostStateDifference,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.e-0,  # Weight for l1 norm
        'l2': 5.e-2,  # Weight for l2 norm
        'alpha': 1e-10,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'ee': {
                'wp': np.array([1.0, 1.0, 0.6]),  # State weights - must be set.
                'target_state': 'tgt0',  # Target state - must be set.
                'average': None,
                'tgt_idx': self.env.get_state_info(name='tgt0')['idx'],
                'data_idx': self.env.get_state_info(name='ee')['idx'],
                'idx_to_use': [0, 1, 2],  # All: X, Y, theta
            },
        },
    }

    safe_radius = 0.15
    cost_safe_state_difference = {
        'type': CostSafeStateDifference,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time.
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'ee': {
                'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                'safe_distance': np.sqrt([safe_radius**2/2, safe_radius**2/2]),
                'outside_cost': np.array([0.0, 0.0]),
                'inside_cost': np.array([1.0, 1.0]),
                'target_state': 'tgt1',  # Target state - must be set.
                'tgt_idx': self.env.get_state_info(name='tgt1')['idx'][:2],
                'data_idx': self.env.get_state_info(name='ee')['idx'][:2],
                'idx_to_use': [0, 1],  # Only X and Y
            },
        },
    }

    cost_final_safe_state_difference = {
        'type': CostSafeStateDifference,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time.
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'ee': {
                'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                'safe_distance': np.sqrt([safe_radius**2/2, safe_radius**2/2]),
                'outside_cost': np.array([0.0, 0.0]),
                'inside_cost': np.array([1.0, 1.0]),
                'target_state': 'tgt1',  # Target state - must be set.
                'tgt_idx': self.env.get_state_info(name='tgt1')['idx'][:2],
                'data_idx': self.env.get_state_info(name='ee')['idx'][:2],
                'idx_to_use': [0, 1],  # Only X and Y
            },
        },
    }

    # Sum costs
    costs_and_weights = [
        # (act_cost, 1.0e-1),
        (act_cost, 1.0e-5),
        # (fk_cost, 1.0e-0),
        # (fk_l1_cost, 1.5e-1),
        # (fk_l2_cost, 1.0e-0),
        # (fk_final_cost, 1.0e-0),
        # (fk_l1_final_cost, 1.5e-1),
        # (fk_l2_final_cost, 1.0e-0),
        (cost_state_difference, 5.0e-0),
        (cost_final_state_difference, 1.0e+3),
        (cost_safe_state_difference, 5.0e+1),
        (cost_final_safe_state_difference, 5.0e+4),
        # WORKING:
        # (cost_safe_distance, 1.0e+1),
        # (state_cost_distance, 5.0e-0),
        # (state_final_cost_distance, 1.0e+3),
    ]

    cost_sum = {
        'type': CostSum,
        'costs': [cw[0] for cw in costs_and_weights],
        'weights': [cw[1] for cw in costs_and_weights],
    }

    return cost_sum
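# ----------------------------------------------------------------------------
# Quick numeric check (standalone helper, not called by the pipeline) of the
# safe-distance geometry above: the per-axis thresholds sqrt([r^2/2, r^2/2])
# describe the axis-aligned square inscribed in a circle of radius
# safe_radius, so the square's corner lies exactly on that circle.
# ----------------------------------------------------------------------------
def _check_safe_distance_geometry(safe_radius=0.15):
    import numpy as np
    per_axis = np.sqrt([safe_radius**2 / 2, safe_radius**2 / 2])
    corner = np.linalg.norm(per_axis)  # distance from center to square corner
    assert np.isclose(corner, safe_radius)
    return per_axis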
def create_cost(self):
    """Instantiate the cost that evaluates the RL agent performance.

    Returns:
        Cost Function
    """
    change_print_color.change('GRAY')
    print("\nCreating Costs...")

    # Action Cost
    act_cost = {
        'type': CostAction,
        'wu': np.ones(self.action_dim) * 1e-4,
        'target': None,  # Target action value
    }

    # FK Cost
    target_distance_hand = np.zeros(6)
    fk_cost = {
        'type': CostFK,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'wp': np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1,  # 10
    }

    fk_l1_cost = {
        'type': CostFK,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'wp': np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 0.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1,  # 10
    }

    fk_l2_cost = {
        'type': CostFK,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'wp': np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 0.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1,  # 10
    }

    fk_final_cost = {
        'type': CostFK,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        'wp': np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 50,
    }

    fk_l1_final_cost = {
        'type': CostFK,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        'wp': np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 0.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 50,
    }

    fk_l2_final_cost = {
        'type': CostFK,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'target_pose': target_distance_hand,
        'tgt_data_type': 'state',  # 'state' or 'observation'
        'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
        'op_point_name': self.env.env_params['hand_name'],
        'op_point_offset': self.env.env_params['hand_offset'],
        'joints_idx': self.env.get_state_info(name='link_position')['idx'],
        'joint_ids': self.env.env_params['joint_ids'],
        'robot_model': self.env.env_params['robot_model'],
        'wp': np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0]),  # One dim less because of 'quat' error | 1) orient, 2) pos
        'evalnorm': evall1l2term,
        # 'evalnorm': evallogl2term,
        'l1': 0.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function. Precise placement at the target
        'l2': 1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        'alpha': 1.0e-5,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 50,
    }

    # State Cost
    target_distance_object = np.zeros(6)
    state_cost_distance = {
        'type': CostState,
        'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.0,  # Weight for l1 norm
        'l2': 0.0,  # Weight for l2 norm
        'alpha': 1e-2,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
        'data_types': {
            'distance_object': {
                # 'wp': np.ones_like(target_state),  # State weights - must be set.
                'wp': np.array([1.0, 1.0, 1.0, 3.0, 3.0, 3.0]),  # State weights - must be set.
                'target_state': target_distance_object,  # Target state - must be set.
                'average': None,  # (12, 3),
                'data_idx': self.env.get_state_info(name='distance_object')['idx'],
            },
        },
    }

    state_final_cost_distance = {
        'type': CostState,
        'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_*: CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
        'l1': 1.0,  # Weight for l1 norm
        'l2': 0.0,  # Weight for l2 norm
        'alpha': 1e-2,  # Constant added in the square root in the l1 norm
        'wp_final_multiplier': 10.0,  # Weight multiplier on final time step.
        'data_types': {
            'distance_object': {
                # 'wp': np.ones_like(target_state),  # State weights - must be set.
                'wp': np.array([1.0, 1.0, 1.0, 3.0, 3.0, 3.0]),  # State weights - must be set.
                'target_state': target_distance_object,  # Target state - must be set.
                'average': None,  # (12, 3),
                'data_idx': self.env.get_state_info(name='distance_object')['idx'],
            },
        },
    }

    # Sum costs
    des_weights = self.task_params['cost_weights']
    print('Costs weights:', des_weights)
    costs_and_weights = [
        (act_cost, des_weights[0]),
        (fk_l1_cost, des_weights[1]),
        (fk_l2_cost, des_weights[2]),
        (fk_l1_final_cost, des_weights[3]),
        (fk_l2_final_cost, des_weights[4]),
        (state_cost_distance, des_weights[5]),
        (state_final_cost_distance, des_weights[6]),
    ]

    cost_sum = {
        'type': CostSum,
        'costs': [cw[0] for cw in costs_and_weights],
        'weights': [cw[1] for cw in costs_and_weights],
    }

    return cost_sum
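# ----------------------------------------------------------------------------
# Illustrative sketch (standalone helper, not called by the pipeline) of the
# l1/l2/alpha trade-off used by the costs above: GPS-style cost utilities
# combine a quadratic term with a smoothed-L1 term, roughly
# l(d) = 0.5*l2*||w*d||^2 + l1*sqrt(alpha + ||w*d||^2).
# The pose-error vector below is hypothetical.
# ----------------------------------------------------------------------------
def _demo_l1l2_term():
    import numpy as np

    def l1l2(d, wp, l1, l2, alpha):
        sq = np.sum((wp * d) ** 2)
        return 0.5 * l2 * sq + l1 * np.sqrt(alpha + sq)

    d = np.array([0.02, -0.01, 0.0, 0.10, 0.10, 0.05])   # hypothetical pose error
    wp = np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0])        # weights used above
    mixed = l1l2(d, wp, l1=1.0, l2=1.0, alpha=1.0e-5)    # fk_cost-style mix
    pure_l1 = l1l2(d, wp, l1=1.0, l2=0.0, alpha=1.0e-5)  # fk_l1_cost-style
    return mixed, pure_l1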
def create_learning_algo(self):
    change_print_color.change('YELLOW')
    print("\nConfiguring learning algorithm...\n")

    # Learning params
    resume_training_itr = None  # Resume from previous training iteration

    # Dynamics
    learned_dynamics = None

    init_traj_distr = {
        'type': init_pd,
        'init_var': np.array([1.0, 1.0, 1.0]) * 5.0e-01,
        'pos_gains': 0.001,  # float or array
        'vel_gains_mult': 0.01,  # Velocity gains multiplier on pos_gains
        'init_action_offset': None,
        'dJoints': self.env.get_total_joints(),  # Total joints in state
        'state_to_pd': 'joints',  # Joints
        'dDistance': 6,
    }

    # Trajectory Optimization Method
    traj_opt_method = {
        'type': TrajOptPI2,
        'kl_threshold': 1.0,  # KL-divergence threshold between old and new policies.
        'covariance_damping': 2.0,  # If greater than zero, covariance is computed as a multiple of the old
                                    # covariance. The multiplier is taken to the power (1 / covariance_damping).
                                    # If greater than one, this slows down convergence and keeps exploration
                                    # noise high for more iterations.
        'min_temperature': 0.001,  # Minimum bound of the temperature optimization for the soft-max
                                   # probabilities of the policy samples.
        'use_sumexp': False,
        'pi2_use_dgd_eta': False,
        'pi2_cons_per_step': True,
        'min_eta': 1e-8,
        'max_eta': 1e16,
        'del0': 1e-4,
    }

    good_trajs = None
    bad_trajs = None
    pi2trajopt_hyperparams = {
        'inner_iterations': self.task_params['inner_iterations'],  # Times the trajectories are updated
        # Initial dual variables
        'init_eta': 4.62,
        # KL step (epsilon)
        'step_rule': 'laplace',  # Whether to use 'laplace' or 'mc' cost in step adjustment
        'kl_step': 0.2,  # Kullback-Leibler step (base_step)
        'min_step_mult': 0.01,  # Min possible value of step multiplier (multiplies kl_step)
        'max_step_mult': 10.0,  # Max possible value of step multiplier (multiplies kl_step)
    }

    gps_hyperparams = {
        'T': self.task_params['T'],  # Total points
        'dt': self.task_params['Ts'],
        'iterations': self.task_params['iterations'],  # GPS episodes --> K iterations
        # Samples
        'num_samples': self.task_params['num_samples'],  # Samples for exploration trajs --> N samples
        'noisy_samples': True,
        'smooth_noise': True,  # Apply Gaussian filter to the generated noise
        'smooth_noise_var': 5.0e+0,  # np.power(2*Ts, 2)  # Variance of the Gaussian filter; in the Kumar (2016) paper it is the std dev of 2*Ts
        'smooth_noise_renormalize': True,  # Renormalize smooth noise to have variance=1
        'noise_var_scale': 5.e-0 * np.ones(self.action_dim),  # Scale for Gaussian noise: N(0, 1)*sqrt(noise_var_scale); only used if smooth_noise_renormalize
        # Cost
        'cost': self.cost,
        # Conditions
        'conditions': len(self.init_cond),  # Total number of initial conditions
        'train_conditions': self.task_params['train_cond'],  # Indexes of conditions used for training
        # TrajDist
        'init_traj_distr': init_traj_distr,
        'fit_dynamics': False,
        'dynamics': learned_dynamics,
        'initial_state_var': 1e-6,  # Max value for x0sigma in trajectories
        # TrajOpt
        'traj_opt': traj_opt_method,
        'max_ent_traj': 0.0,  # Weight of maximum entropy term in trajectory optimization  # TODO: CHECK THIS VALUE
        # Others
        'algo_hyperparams': pi2trajopt_hyperparams,
        'data_files_dir': self.hyperparams['log_dir'],
    }

    return PI2TrajOpt(self.agent, self.env, **gps_hyperparams)
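# ----------------------------------------------------------------------------
# Illustrative sketch (standalone helper, not called by the pipeline): a
# PI^2-style update weights each sampled trajectory by a soft-max over costs
# with temperature eta; 'min_temperature' above acts as a lower bound on that
# temperature. The sample costs below are hypothetical.
# ----------------------------------------------------------------------------
def _demo_pi2_weights(eta=1.0, min_temperature=0.001):
    import numpy as np
    costs = np.array([10.0, 12.0, 30.0])           # hypothetical sample costs
    z = -(costs - costs.min()) / max(eta, min_temperature)
    w = np.exp(z)
    return w / w.sum()                             # normalized sample weights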