def main():
    parser = base_config.get_base_config()
    parser = mf_config.get_mf_config(parser)
    parser = il_config.get_il_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='gail-mfrl-mf-' + args.task,
                                time_str=args.exp_id)

    # no random policy for model-free rl
    assert args.random_timesteps == 0
    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    from mbbl.trainer import gail_trainer
    from mbbl.sampler import singletask_sampler
    from mbbl.worker import mf_worker
    import mbbl.network.policy.trpo_policy
    import mbbl.network.policy.ppo_policy
    policy_network = {
        'ppo': mbbl.network.policy.ppo_policy.policy_network,
        'trpo': mbbl.network.policy.trpo_policy.policy_network
    }[args.trust_region_method]

    # here the dynamics and reward are simply placeholders, which cannot be
    # called to pred next state or reward
    from mbbl.network.dynamics.base_dynamics import base_dynamics_network
    from mbbl.network.reward.GAN_reward import reward_network

    train(gail_trainer, singletask_sampler, mf_worker, base_dynamics_network,
          policy_network, reward_network, args)

def __init__(self, env_name, rand_seed, misc_info):
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()
    self._data_recorder = {
        'record_flag': False,
        'episode_data_buffer': [],
        'timestep_data_buffer': [],
        'num_episode': 1,
        'data_name': ''
    }

    # return the reset as the gym?
    if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
        self._reset_return_obs_only = True
        self.observation_space, self.action_space = \
            env_util.make_ob_ac_box(self._env)
    else:
        self._reset_return_obs_only = False

    # add timestep into the observation?
    if 'add_timestep_into_ob' in misc_info and \
            misc_info['add_timestep_into_ob']:
        self._add_timestep_into_ob = True
    else:
        self._add_timestep_into_ob = False

def main():
    parser = base_config.get_base_config()
    parser = cem_config.get_cem_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        if args.output_dir is None:
            args.output_dir = "log"
        log_path = str(args.output_dir) + '/pets-cem-' + str(args.task) + \
            '/seed-' + str(args.seed) + \
            '/num_planning_traj-' + str(args.num_planning_traj) + \
            '/planning_depth-' + str(args.planning_depth) + \
            '/timesteps_per_batch-' + str(args.timesteps_per_batch) + \
            '/random_timesteps-' + str(args.random_timesteps) + \
            '/max_timesteps-' + str(args.max_timesteps) + '/'
        logger.set_file_handler(path=log_path, prefix='', time_str="0")

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_pets_sampler
    from mbbl.worker import cem_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    train(shooting_trainer, singletask_pets_sampler, cem_worker,
          dynamics_network, policy_network, reward_network, args)

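# For reference, the log-path construction above yields a nested layout of the
# following form (a sketch; the <...> fields are the parsed argument values):
#
#   <output_dir>/pets-cem-<task>/seed-<seed>/num_planning_traj-<N>/
#       planning_depth-<D>/timesteps_per_batch-<B>/random_timesteps-<R>/
#       max_timesteps-<M>/
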
def __init__(self, args, worker_type, network_type):
    ''' @brief:
            the master agent has several actors (or samplers) to do the
            sampling for it.
    '''
    super(sampler, self).__init__(args, worker_type, network_type)
    self._base_path = init_path.get_abs_base_dir()

    if self.args.num_ilqr_traj % self.args.num_workers != 0:
        logger.warning(
            'Consider using a number of workers such that the number of '
            'planning trajectories is an integer multiple of the number '
            'of workers. Current: {} planning_traj, {} workers'.format(
                self.args.num_ilqr_traj, self.args.num_workers)
        )

    self._damping_args = {
        'factor': self.args.LM_damping_factor,
        'min_damping': self.args.min_LM_damping,
        'max_damping': self.args.max_LM_damping
    }

    self._ilqr_data_wrapper = ilqr_data_wrapper.ilqr_data_wrapper(
        self.args, self._env_info['ob_size'], self._env_info['action_size'])
    # @self._plan_data is shared with the @ilqr_data_wrapper._plan_data
    self._plan_data = self._ilqr_data_wrapper.get_plan_data()

def __init__(self, env_name, rand_seed, misc_info):
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()
    self._use_pets_reward = 'pets' in misc_info and misc_info['pets']
    self._len_qpos, self._len_qvel = \
        env_util.get_gym_q_info(self._env, self._current_version)
    self._done_counter = -1

    # return the reset as the gym?
    if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
        self._reset_return_obs_only = True
        self.observation_space, self.action_space = \
            self._env.observation_space, self._env.action_space
        # it's possible some environments have different obs
        self.observation_space = \
            env_util.box(self._env_info['ob_size'], -1, 1)
    else:
        self._reset_return_obs_only = False

    if 'no_termination' in misc_info and misc_info['no_termination']:
        self._no_termination = True
    else:
        self._no_termination = False

def main():
    parser = base_config.get_base_config()
    parser = rs_config.get_rs_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-rs-' + args.task,
                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_sampler
    from mbbl.worker import rs_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    train(shooting_trainer, singletask_sampler, rs_worker, dynamics_network,
          policy_network, reward_network, args)

def __init__(self, args, worker_type, network_type):
    ''' @brief:
            the master agent has several actors (or samplers) to do the
            sampling for it.
    '''
    self.args = args
    self._npr = np.random.RandomState(args.seed + 23333)

    if "quanser" not in args.task:
        self._observation_size, self._action_size, _ = \
            env_register.io_information(self.args.task)
    else:
        # hard-coded io information (ob size, action size) for quanser tasks
        if args.task == "quanser_qube":
            self._observation_size, self._action_size, _ = 6, 1, 500
        elif args.task == "quanser_ball":
            self._observation_size, self._action_size, _ = 8, 2, 1000
        elif args.task == "quanser_cartpole":
            self._observation_size, self._action_size, _ = 5, 1, 2000

    self._worker_type = worker_type
    self._network_type = network_type

    # init the multiprocess actors
    self._task_queue = multiprocessing.JoinableQueue()
    self._result_queue = multiprocessing.Queue()
    self._init_workers()
    self._build_env()

    self._base_path = init_path.get_abs_base_dir()
    self._current_iteration = 0

def __init__(self, env_name, rand_seed, misc_info):
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()
    self._use_pets_reward = 'pets' in misc_info and misc_info['pets']
    self._len_qpos, self._len_qvel = \
        env_util.get_gym_q_info(self._env, self._current_version)

    # return the reset as the gym?
    if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
        self._reset_return_obs_only = True
        self.observation_space, self.action_space = \
            self._env.observation_space, self._env.action_space
        # it's possible some environments have different obs
        self.observation_space = \
            env_util.box(self._env_info['ob_size'], -1, 1)
    else:
        self._reset_return_obs_only = False

    # noise levels for the noisy cheetah variants
    # ('O' variants perturb the observation, 'A' variants perturb the action)
    if self._env_name == 'gym_cheetahO01':
        self._action_noise = 0.0
        self._ob_noise = 0.1
    elif self._env_name == 'gym_cheetahO001':
        self._action_noise = 0.0
        self._ob_noise = 0.01
    elif self._env_name == 'gym_cheetahA01':
        self._action_noise = 0.1
        self._ob_noise = 0.0
    elif self._env_name == 'gym_cheetahA003':
        self._action_noise = 0.03
        self._ob_noise = 0.0

def main():
    parser = base_config.get_base_config()
    parser = ilqr_config.get_ilqr_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-ilqr-' + args.task,
                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_ilqr_sampler
    from mbbl.worker import model_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    if (not args.gt_reward) or (not args.gt_dynamics):
        raise NotImplementedError(
            'iLQR with learned dynamics or reward is not implemented yet'
        )

    train(shooting_trainer, singletask_ilqr_sampler, model_worker,
          dynamics_network, policy_network, reward_network, args)

def visualize_pose_from_expert_data(data_file, camera_id):
    expert_traj, pos_data, env_name, dt = load_pose_data(data_file, camera_id)
    pos_connection = POS_CONNECTION[env_name]

    image_size = \
        expert_traj[camera_id]['camera_info'][camera_id]['image_size']
    image = np.zeros([image_size, image_size, 3], dtype=np.uint8)
    env, _ = make_env(env_name, 1234, {})

    for i_pos_id in range(100):
        i_pos_data = pos_data[i_pos_id]

        # render the image and draw the pose on top of it
        image = env.render(camera_id=camera_id,
                           qpos=expert_traj[0]['qpos'][i_pos_id])
        fig = plt.figure()
        visualize_pose(image, i_pos_data, pos_connection, show=False)
        fig.canvas.draw()
        plt_results = np.array(fig.canvas.renderer._renderer)
        print('Processing %d out of %d' % (i_pos_id, 100))

        if i_pos_id == 0:
            # the rendered array is (rows, cols, channels); cv2.VideoWriter
            # expects the frame size as (width, height)
            height, width, _ = plt_results.shape
            output_dir = \
                data_file.replace('.npy', '_' + str(camera_id) + '.mp4')
            video = cv2.VideoWriter(
                os.path.join(init_path.get_abs_base_dir(), output_dir),
                cv2.VideoWriter_fourcc(*'mp4v'), 40, (width, height))

        plt.imshow(plt_results)
        video.write(plt_results[:, :, [2, 1, 0]])  # RGB -> BGR for opencv
        plt.close()

    video.release()

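# A minimal usage sketch (hedged): the .npy path and camera id below are
# hypothetical placeholders, not files shipped with the repo.
if __name__ == '__main__':
    visualize_pose_from_expert_data('data/gym_cheetah_pose.npy', camera_id=0)
    # the call writes the rendered video next to the data file,
    # e.g. data/gym_cheetah_pose_0.mp4
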
def __init__(self, args, session, name_scope,
             observation_size, action_size):
    super(policy_network, self).__init__(
        args, session, name_scope, observation_size, action_size
    )
    self._base_dir = init_path.get_abs_base_dir()

def train(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()
        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights of the dynamics, reward and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: push the updated weights back to the sampler
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)

        totalsteps = training_return['totalsteps']
        if totalsteps > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))

def __init__(self, args, worker_type, network_type):
    ''' @brief:
            the master agent has several actors (or samplers) to do the
            sampling for it.
    '''
    super(sampler, self).__init__(args, worker_type, network_type)
    self._base_path = init_path.get_abs_base_dir()

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    super(ggnn_dynamics_network, self).__init__(
        args, session, name_scope, observation_size, action_size)
    self._base_dir = init_path.get_abs_base_dir()
    self._debug_it = 0

def __init__(self, env_name, rand_seed, misc_info):
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()

    # default video resolution, overridable through misc_info
    self._VIDEO_H = 100
    self._VIDEO_W = 150
    if 'video_width' in misc_info:
        self._VIDEO_W = misc_info['video_width']
    if 'video_height' in misc_info:
        self._VIDEO_H = misc_info['video_height']

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    self._base_dir = init_path.get_abs_base_dir()
    self._traj_depth = args.ilqr_depth
    self._num_gps_condition = 1 if args.gps_single_condition \
        else args.num_ilqr_traj
    super(policy_network, self).__init__(args, session, name_scope,
                                         observation_size, action_size)

def __init__(self, args, ob_size, action_size, plan_data=None):
    self.args = args
    self._ob_size = ob_size
    self._action_size = action_size
    self._npr = np.random.RandomState(args.seed + 2333)

    self._set_data_shape()
    self._init_data(plan_data)
    self._base_path = init_path.get_abs_base_dir()

def train_mf(mb_steps, policy_weight, trainer, sampler, worker, dynamics,
             policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)

    # initialize the policy with the dagger policy weight
    trainer_tasks.put((parallel_util.SET_POLICY_WEIGHT, policy_weight))
    trainer_tasks.join()
    init_weights['policy'][0] = policy_weight
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        rollout_data = \
            sampler_agent.rollouts_using_worker_playing(use_true_env=True)
        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights of the dynamics, reward and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: push the updated weights back to the sampler
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict, mb_steps)

        if training_return['totalsteps'] > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    env_info = env_register._ENV_INFO[args.task_name]
    self._image_width, self._image_height, self._image_channel = \
        env_info['image_width'], env_info['image_height'], \
        env_info['image_channel']
    super(policy_network, self).__init__(args, session, name_scope,
                                         observation_size, action_size)
    self._base_dir = init_path.get_abs_base_dir()

def __init__(self, env_name='dm-humanoid-noise', rand_seed=1234,
             misc_info={}):
    self._base_path = init_path.get_abs_base_dir()
    super(env, self).__init__(env_name, rand_seed, misc_info)

    # the noise level
    if env_name in ['dm-humanoid-noise', 'cmu-humanoid-imitation']:
        self._noise_c = 0.01
    else:
        assert env_name in ['dm-humanoid']
        self._noise_c = 0

def __init__(self, env_name, rand_seed, misc_info):
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()

    # return the reset as the gym?
    if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
        self._reset_return_obs_only = True
        self.observation_space, self.action_space = \
            self._env.observation_space, self._env.action_space
        # it's possible some environments have different obs
        self.observation_space = \
            env_util.box(self._env_info['ob_size'], -1, 1)
    else:
        self._reset_return_obs_only = False

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    super(dynamics_network, self).__init__(args, session, name_scope,
                                           observation_size, action_size)
    self._base_dir = init_path.get_abs_base_dir()

    # replay buffer of initial states (x0), sized by gps_init_state_replay_size
    self._replay_x0 = {
        'data': np.zeros(
            [self.args.gps_init_state_replay_size, self._observation_size]),
        'cursor': 0,
        'size': 0
    }

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    ''' @input:
            @ob_placeholder:
                if this placeholder is not given, we will make one in this
                class.
            @trainable:
                If it is set to true, then the policy weights will be
                trained. It is useful when the class is a subnet which is
                not trainable.
    '''
    super(dynamics_network, self).__init__(args, session, name_scope,
                                           observation_size, action_size)
    self._base_dir = init_path.get_abs_base_dir()

def main():
    parser = base_config.get_base_config()
    parser = rs_config.get_rs_config(parser)
    parser = il_config.get_il_config(parser)
    args = base_config.make_parser(parser)
    args = il_config.post_process_config(args)

    if args.write_log:
        args.log_path = logger.set_file_handler(
            path=args.output_dir,
            prefix='inverse_dynamics-' + args.task,
            time_str=args.exp_id
        )

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    train(args)

def __init__(self, args, network_type, task_queue, result_queue,
             name_scope='trainer'):
    self._num_gps_condition = 1 if args.gps_single_condition \
        else args.num_ilqr_traj

    # the base agent
    super(trainer, self).__init__(args=args, network_type=network_type,
                                  task_queue=task_queue,
                                  result_queue=result_queue,
                                  name_scope=name_scope)
    self._base_path = init_path.get_abs_base_dir()
    self._iteration = 0

def __init__(self, args, session, name_scope,
             observation_size, action_size):
    self.args = args
    self._session = session
    self._name_scope = name_scope

    self._observation_size = observation_size
    self._action_size = action_size
    self._task_name = args.task_name
    self._network_shape = args.policy_network_shape

    self._npr = np.random.RandomState(args.seed)
    self._whitening_operator = {}
    self._whitening_variable = []
    self._base_dir = init_path.get_abs_base_dir()

def __init__(self, env_name, rand_seed, misc_info):
    assert env_name in ['gym_humanoid', 'gym_slimhumanoid',
                        'gym_nostopslimhumanoid']
    super(env, self).__init__(env_name, rand_seed, misc_info)
    self._base_path = init_path.get_abs_base_dir()
    self._len_qpos, self._len_qvel = \
        env_util.get_gym_q_info(self._env, self._current_version)

    # return the reset as the gym?
    if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
        self._reset_return_obs_only = True
        self.observation_space, self.action_space = \
            self._env.observation_space, self._env.action_space
        # it's possible some environments have different obs
        self.observation_space = \
            env_util.box(self._env_info['ob_size'], -1, 1)
    else:
        self._reset_return_obs_only = False

def main():
    parser = base_config.get_base_config()
    parser = metrpo_config.get_metrpo_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-metrpo-' + args.task,
                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    from mbbl.trainer import metrpo_trainer
    from mbbl.sampler import singletask_metrpo_sampler
    from mbbl.worker import metrpo_worker
    from mbbl.network.dynamics.deterministic_forward_dynamics import \
        dynamics_network
    from mbbl.network.policy.trpo_policy import policy_network
    from mbbl.network.reward.groundtruth_reward import reward_network

    train(metrpo_trainer, singletask_metrpo_sampler, metrpo_worker,
          dynamics_network, policy_network, reward_network, args)

def load_expert_data(traj_data_name, traj_episode_num):
    # locate the recorded expert trajectories relative to the repo root
    traj_base_dir = init_path.get_abs_base_dir()
    if not traj_data_name.endswith('.npy'):
        traj_data_name = traj_data_name + '.npy'
    data_dir = os.path.join(traj_base_dir, traj_data_name)
    assert os.path.exists(data_dir), \
        logger.error('Invalid path: {}'.format(data_dir))
    expert_trajectory = np.load(data_dir, encoding="latin1")

    # choose only the top trajectories
    if len(expert_trajectory) > traj_episode_num:
        logger.warning('Using only %d trajs out of %d trajs' %
                       (traj_episode_num, len(expert_trajectory)))
    expert_trajectory = \
        expert_trajectory[:min(traj_episode_num, len(expert_trajectory))]

    return expert_trajectory

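# A minimal usage sketch (hedged): the data file name and episode count are
# hypothetical; the path is resolved against the repository base directory and
# the '.npy' suffix is appended automatically.
if __name__ == '__main__':
    expert_trajs = load_expert_data('data/gym_cheetah_expert',
                                    traj_episode_num=10)
    print('Loaded {} expert trajectories'.format(len(expert_trajs)))
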
def __init__(self, args, session, name_scope,
             observation_size, action_size):
    ''' @input:
            @ob_placeholder:
                if this placeholder is not given, we will make one in this
                class.
            @trainable:
                If it is set to true, then the policy weights will be
                trained. It is useful when the class is a subnet which is
                not trainable.
    '''
    super(reward_network, self).__init__(
        args, session, name_scope, observation_size, action_size
    )
    self._base_dir = init_path.get_abs_base_dir()

    # load the expert data
    self._expert_trajectory_obs = expert_data_util.load_expert_trajectory(
        self.args.expert_data_name, self.args.traj_episode_num
    )