Example #1
def main():
    parser = base_config.get_base_config()
    parser = mf_config.get_mf_config(parser)
    parser = il_config.get_il_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='gail-mfrl-mf-' + args.task,
                                time_str=args.exp_id)

    # no random policy for model-free rl
    assert args.random_timesteps == 0

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    from mbbl.trainer import gail_trainer
    from mbbl.sampler import singletask_sampler
    from mbbl.worker import mf_worker
    import mbbl.network.policy.trpo_policy
    import mbbl.network.policy.ppo_policy

    policy_network = {
        'ppo': mbbl.network.policy.ppo_policy.policy_network,
        'trpo': mbbl.network.policy.trpo_policy.policy_network
    }[args.trust_region_method]

    # here the dynamics and reward networks are simply placeholders; they
    # cannot be called to predict the next state or reward
    from mbbl.network.dynamics.base_dynamics import base_dynamics_network
    from mbbl.network.reward.GAN_reward import reward_network

    train(gail_trainer, singletask_sampler, mf_worker, base_dynamics_network,
          policy_network, reward_network, args)
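The policy class above is chosen from a small dispatch dictionary keyed by args.trust_region_method. A minimal, self-contained sketch of the same pattern with an explicit error for unknown keys; the PPOPolicy/TRPOPolicy classes below are placeholders, not the mbbl implementations:

class PPOPolicy(object):
    """Placeholder standing in for mbbl.network.policy.ppo_policy.policy_network."""


class TRPOPolicy(object):
    """Placeholder standing in for mbbl.network.policy.trpo_policy.policy_network."""


def select_policy_class(trust_region_method):
    # dispatch table: method name -> policy class
    dispatch = {'ppo': PPOPolicy, 'trpo': TRPOPolicy}
    try:
        return dispatch[trust_region_method]
    except KeyError:
        raise ValueError('unknown trust_region_method: {!r} '
                         '(expected one of {})'.format(trust_region_method,
                                                       sorted(dispatch)))


print(select_policy_class('trpo'))  # -> <class '__main__.TRPOPolicy'>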
Example #2
    def __init__(self, env_name, rand_seed, misc_info):
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()
        self._data_recorder = {
            'record_flag': False,
            'episode_data_buffer': [],
            'timestep_data_buffer': [],
            'num_episode': 1,
            'data_name': ''
        }

        # return only the observation from reset(), as gym does?
        if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
            self._reset_return_obs_only = True
            self.observation_space, self.action_space = \
                env_util.make_ob_ac_box(self._env)
        else:
            self._reset_return_obs_only = False

        # add timestep into the observation?
        if 'add_timestep_into_ob' in misc_info and \
                misc_info['add_timestep_into_ob']:
            self._add_timestep_into_ob = True
        else:
            self._add_timestep_into_ob = False
Example #3
def main():
    parser = base_config.get_base_config()
    parser = cem_config.get_cem_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        if args.output_dir is None:
            args.output_dir = "log"
        log_path = str(args.output_dir) + '/pets-cem-' + str(args.task) + \
            '/seed-' + str(args.seed) + \
            '/num_planning_traj-' + str(args.num_planning_traj) + \
            '/planning_depth-' + str(args.planning_depth) + \
            '/timesteps_per_batch-' + str(args.timesteps_per_batch) + \
            '/random_timesteps-' + str(args.random_timesteps) + \
            '/max_timesteps-' + str(args.max_timesteps) + '/'
        logger.set_file_handler(path=log_path, prefix='', time_str="0")

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_pets_sampler
    from mbbl.worker import cem_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    train(shooting_trainer, singletask_pets_sampler, cem_worker,
          dynamics_network, policy_network, reward_network, args)
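The log directory above encodes the key hyperparameters directly into the path. A small standalone sketch of the same idea, building such a path from a plain dict; the helper name and keys here are illustrative, not part of the mbbl config:

import os


def make_log_path(output_dir, algo, hparams):
    # one path component per hyperparameter, e.g. 'seed-1234/planning_depth-10/...'
    parts = ['{}-{}'.format(key, value) for key, value in sorted(hparams.items())]
    return os.path.join(output_dir or 'log', algo, *parts)


print(make_log_path('log', 'pets-cem-gym_cheetah',
                    {'seed': 1234, 'planning_depth': 10,
                     'num_planning_traj': 500}))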
Example #4
    def __init__(self, args, worker_type, network_type):
        '''
            @brief:
                the master agent has several actors (or samplers) to do the
                sampling for it.
        '''
        super(sampler, self).__init__(args, worker_type, network_type)
        self._base_path = init_path.get_abs_base_dir()
        if self.args.num_ilqr_traj % self.args.num_workers != 0:
            logger.warning(
                'Consider using a number of workers such that the number of '
                'planning trajectories is an integer multiple of the number '
                'of workers. Current: {} planning_traj, {} workers'.format(
                    self.args.num_ilqr_traj, self.args.num_workers))

        self._damping_args = {
            'factor': self.args.LM_damping_factor,
            'min_damping': self.args.min_LM_damping,
            'max_damping': self.args.max_LM_damping
        }
        self._ilqr_data_wrapper = ilqr_data_wrapper.ilqr_data_wrapper(
            self.args, self._env_info['ob_size'],
            self._env_info['action_size'])
        # @self._plan_data is shared with the @ilqr_data_wrapper._plan_data
        self._plan_data = self._ilqr_data_wrapper.get_plan_data()
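The _damping_args dict above carries a multiplicative factor and damping bounds for the iLQR backward pass. The exact schedule mbbl uses is not shown in this snippet; a common Levenberg-Marquardt-style adaptation, given here only as a hedged sketch, looks like this:

def update_damping(damping, improved, factor=10.0,
                   min_damping=1e-6, max_damping=1e10):
    """Increase damping after a failed backward pass / line search,
    decrease it after a successful one, clipped to [min, max]."""
    if improved:
        damping = max(damping / factor, min_damping)
    else:
        damping = min(damping * factor, max_damping)
    return damping


damping = 1e-3
damping = update_damping(damping, improved=False)  # -> 0.01
damping = update_damping(damping, improved=True)   # -> 0.001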
Example #5
    def __init__(self, env_name, rand_seed, misc_info):
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()

        self._use_pets_reward = 'pets' in misc_info and misc_info['pets']

        self._len_qpos, self._len_qvel = \
            env_util.get_gym_q_info(self._env, self._current_version)
        self._done_counter = -1

        # return only the observation from reset(), as gym does?
        if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
            self._reset_return_obs_only = True
            self.observation_space, self.action_space = \
                self._env.observation_space, self._env.action_space
            # it's possible some environments have different obs
            self.observation_space = \
                env_util.box(self._env_info['ob_size'], -1, 1)
        else:
            self._reset_return_obs_only = False

        if 'no_termination' in misc_info and misc_info['no_termination']:
            self._no_termination = True
        else:
            self._no_termination = False
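The repeated "'key' in misc_info and misc_info['key']" checks can be written more compactly with dict.get. A roughly equivalent standalone sketch (the helper name is illustrative):

def parse_misc_info(misc_info):
    return {
        'reset_return_obs_only': misc_info.get('reset_type') == 'gym',
        'use_pets_reward': bool(misc_info.get('pets', False)),
        'no_termination': bool(misc_info.get('no_termination', False)),
    }


print(parse_misc_info({'reset_type': 'gym', 'pets': True}))
# {'reset_return_obs_only': True, 'use_pets_reward': True, 'no_termination': False}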
Example #6
def main():
    parser = base_config.get_base_config()
    parser = rs_config.get_rs_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-rs-' + args.task,
                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_sampler
    from mbbl.worker import rs_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    train(shooting_trainer, singletask_sampler, rs_worker, dynamics_network,
          policy_network, reward_network, args)
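This entry point wires up the random-shooting worker: sample many random action sequences, roll them through the (learned or ground-truth) dynamics and reward models, and execute the first action of the best sequence. A generic numpy sketch of that idea; the dynamics/reward callables here are toy stand-ins, not the mbbl networks:

import numpy as np


def random_shooting(state, dynamics_fn, reward_fn, action_size,
                    num_traj=1000, depth=10, rng=np.random):
    # sample candidate action sequences uniformly in [-1, 1]
    actions = rng.uniform(-1, 1, size=(num_traj, depth, action_size))
    states = np.repeat(state[None, :], num_traj, axis=0)
    returns = np.zeros(num_traj)
    for t in range(depth):
        returns += reward_fn(states, actions[:, t])
        states = dynamics_fn(states, actions[:, t])
    return actions[np.argmax(returns), 0]  # best first action


# toy dynamics/reward just to show the call pattern
best_action = random_shooting(
    np.zeros(3),
    dynamics_fn=lambda s, a: s + 0.1 * a,
    reward_fn=lambda s, a: -np.square(s).sum(axis=1),
    action_size=3)
print(best_action.shape)  # (3,)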
Example #7
    def __init__(self, args, worker_type, network_type):
        '''
            @brief:
                the master agent has several actors (or samplers) to do the
                sampling for it.
        '''
        self.args = args
        self._npr = np.random.RandomState(args.seed + 23333)
        if "quanser" not in args.task:
            self._observation_size, self._action_size, _ = \
                env_register.io_information(self.args.task)
        else:
            if args.task == "quanser_qube":
                self._observation_size, self._action_size, _ = 6, 1, 500
            elif args.task == "quanser_ball":
                self._observation_size, self._action_size, _ = 8, 2, 1000
            elif args.task == "quanser_cartpole":
                self._observation_size, self._action_size, _ = 5, 1, 2000

        self._worker_type = worker_type
        self._network_type = network_type

        # init the multiprocess actors
        self._task_queue = multiprocessing.JoinableQueue()
        self._result_queue = multiprocessing.Queue()
        self._init_workers()
        self._build_env()
        self._base_path = init_path.get_abs_base_dir()

        self._current_iteration = 0
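The sampler base class above connects its worker processes through a multiprocessing.JoinableQueue for tasks and a plain Queue for results. A minimal standalone sketch of that producer/worker pattern, not the mbbl worker code:

import multiprocessing


def worker_loop(task_queue, result_queue):
    while True:
        task = task_queue.get()
        if task is None:          # poison pill -> shut down
            task_queue.task_done()
            break
        result_queue.put(task * task)
        task_queue.task_done()


if __name__ == '__main__':
    task_queue = multiprocessing.JoinableQueue()
    result_queue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=worker_loop,
                                       args=(task_queue, result_queue))
               for _ in range(2)]
    for proc in workers:
        proc.start()
    for task in range(4):
        task_queue.put(task)
    task_queue.join()             # wait until every task has been processed
    for _ in workers:
        task_queue.put(None)
    print(sorted(result_queue.get() for _ in range(4)))  # [0, 1, 4, 9]
    for proc in workers:
        proc.join()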
Example #8
    def __init__(self, env_name, rand_seed, misc_info):
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()

        self._use_pets_reward = 'pets' in misc_info and misc_info['pets']

        self._len_qpos, self._len_qvel = \
            env_util.get_gym_q_info(self._env, self._current_version)

        # return only the observation from reset(), as gym does?
        if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
            self._reset_return_obs_only = True
            self.observation_space, self.action_space = \
                self._env.observation_space, self._env.action_space
            # it's possible some environments have different obs
            self.observation_space = \
                env_util.box(self._env_info['ob_size'], -1, 1)
        else:
            self._reset_return_obs_only = False

        if self._env_name == 'gym_cheetahO01':
            self._action_noise = 0.0
            self._ob_noise = 0.1
        elif self._env_name == 'gym_cheetahO001':
            self._action_noise = 0.0
            self._ob_noise = 0.01
        elif self._env_name == 'gym_cheetahA01':
            self._action_noise = 0.1
            self._ob_noise = 0.0
        elif self._env_name == 'gym_cheetahA003':
            self._action_noise = 0.03
            self._ob_noise = 0.0
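The gym_cheetahO* and gym_cheetahA* variants above differ only in how much Gaussian noise is injected into the observation or the action. A hedged sketch of how such noise might be applied around a step function; the step_fn here is a toy stand-in, not the mbbl env:

import numpy as np


def noisy_step(step_fn, action, action_noise, ob_noise, rng=np.random):
    # perturb the action before stepping, then the returned observation
    action = action + action_noise * rng.randn(*np.shape(action))
    obs, reward, done = step_fn(action)
    obs = obs + ob_noise * rng.randn(*np.shape(obs))
    return obs, reward, done


# toy step function: observation is just the action, reward is its squared norm
obs, reward, done = noisy_step(
    lambda a: (a.copy(), -float(np.square(a).sum()), False),
    np.zeros(6), action_noise=0.1, ob_noise=0.0)
print(obs.shape, done)  # (6,) False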
Example #9
def main():
    parser = base_config.get_base_config()
    parser = ilqr_config.get_ilqr_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-ilqr-' + args.task,
                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    from mbbl.trainer import shooting_trainer
    from mbbl.sampler import singletask_ilqr_sampler
    from mbbl.worker import model_worker
    from mbbl.network.policy.random_policy import policy_network

    if args.gt_dynamics:
        from mbbl.network.dynamics.groundtruth_forward_dynamics import \
            dynamics_network
    else:
        from mbbl.network.dynamics.deterministic_forward_dynamics import \
            dynamics_network

    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    if not (args.gt_dynamics and args.gt_reward):
        raise NotImplementedError(
            'iLQR with learned dynamics or reward is not implemented yet')

    train(shooting_trainer, singletask_ilqr_sampler, model_worker,
          dynamics_network, policy_network, reward_network, args)
Example #10
def visualize_pose_from_expert_data(data_file, camera_id):
    expert_traj, pos_data, env_name, dt = load_pose_data(data_file, camera_id)
    pos_connection = POS_CONNECTION[env_name]

    # import pdb; pdb.set_trace()
    image_size = expert_traj[camera_id]['camera_info'][camera_id]['image_size']
    image = np.zeros([image_size, image_size, 3], dtype=np.uint8)
    env, _ = make_env(env_name, 1234, {})

    fig = plt.figure()
    for i_pos_id in range(100):
        i_pos_data = pos_data[i_pos_id]

        # render the image
        image = env.render(camera_id=camera_id,
                           qpos=expert_traj[0]['qpos'][i_pos_id])

        fig = plt.figure()
        visualize_pose(image, i_pos_data, pos_connection, show=False)
        fig.canvas.draw()
        plt_results = np.array(fig.canvas.renderer._renderer)
        print('Processing %d out of %d' % (i_pos_id, 100))
        if i_pos_id == 0:
            height, width, _ = plt_results.shape
            output_dir = \
                data_file.replace('.npy', '_' + str(camera_id) + '.mp4')
            video = cv2.VideoWriter(
                os.path.join(init_path.get_abs_base_dir(), output_dir),
                cv2.VideoWriter_fourcc(*'mp4v'), 40, (width, height))
        plt.imshow(plt_results)
        video.write(plt_results[:, :, [2, 1, 0]])
        plt.close()

    video.release()
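Two OpenCV details in the snippet above are easy to get wrong: cv2.VideoWriter takes the frame size as (width, height), and write() expects BGR uint8 frames, hence the [:, :, [2, 1, 0]] channel flip of the RGB matplotlib buffer. A minimal standalone sketch writing a few synthetic frames (the output filename is illustrative):

import numpy as np
import cv2

height, width = 120, 160
video = cv2.VideoWriter('demo.mp4', cv2.VideoWriter_fourcc(*'mp4v'),
                        40, (width, height))      # frame size is (width, height)
for i in range(10):
    frame_rgb = np.full((height, width, 3), i * 20, dtype=np.uint8)
    video.write(frame_rgb[:, :, [2, 1, 0]])       # convert RGB -> BGR
video.release()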
Example #11
    def __init__(self, args, session, name_scope,
                 observation_size, action_size):

        super(policy_network, self).__init__(
            args, session, name_scope, observation_size, action_size
        )
        self._base_dir = init_path.get_abs_base_dir()
Example #12
def train(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()

        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights for dynamics and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 4: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)

        totalsteps = training_return['totalsteps']
        if totalsteps > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
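timer_dict above is an OrderedDict of wall-clock timestamps taken after each stage, so the per-stage cost is the difference between consecutive entries. A small sketch of how log_results might compute those durations; this helper is an illustration, not the mbbl implementation:

import time
from collections import OrderedDict


def stage_durations(timer_dict):
    keys = list(timer_dict)
    return OrderedDict(
        (keys[i], timer_dict[keys[i]] - timer_dict[keys[i - 1]])
        for i in range(1, len(keys))
    )


timer_dict = OrderedDict()
timer_dict['Program Start'] = time.time()
time.sleep(0.01)
timer_dict['Generate Rollout'] = time.time()
time.sleep(0.02)
timer_dict['Train Weights'] = time.time()
for name, seconds in stage_durations(timer_dict).items():
    print('{}: {:.3f}s'.format(name, seconds))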
Example #13
    def __init__(self, args, worker_type, network_type):
        '''
            @brief:
                the master agent has several actors (or samplers) to do the
                sampling for it.
        '''
        super(sampler, self).__init__(args, worker_type, network_type)
        self._base_path = init_path.get_abs_base_dir()
Example #14
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):
        super(ggnn_dynamics_network,
              self).__init__(args, session, name_scope, observation_size,
                             action_size)
        self._base_dir = init_path.get_abs_base_dir()
        self._debug_it = 0

        return
Example #15
    def __init__(self, env_name, rand_seed, misc_info):
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()
        self._VIDEO_H = 100
        self._VIDEO_W = 150

        if 'video_width' in misc_info:
            self._VIDEO_W = misc_info['video_width']
        if 'video_height' in misc_info:
            self._VIDEO_H = misc_info['video_height']
Example #16
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):

        self._base_dir = init_path.get_abs_base_dir()
        self._traj_depth = args.ilqr_depth
        self._num_gps_condition = 1 if args.gps_single_condition \
            else args.num_ilqr_traj

        super(policy_network, self).__init__(args, session, name_scope,
                                             observation_size, action_size)
Example #17
    def __init__(self, args, ob_size, action_size, plan_data=None):

        self.args = args
        self._ob_size = ob_size
        self._action_size = action_size
        self._npr = np.random.RandomState(args.seed + 2333)

        self._set_data_shape()
        self._init_data(plan_data)
        self._base_path = init_path.get_abs_base_dir()
Example #18
def train_mf(mb_steps, policy_weight, trainer, sampler, worker, dynamics,
             policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)

    # Initialize the policy with dagger policy weight.
    trainer_tasks.put((parallel_util.SET_POLICY_WEIGHT, policy_weight))
    trainer_tasks.join()
    init_weights['policy'][0] = policy_weight
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        rollout_data = \
            sampler_agent.rollouts_using_worker_playing(use_true_env=True)

        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights for dynamics and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 4: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict, mb_steps)

        if training_return['totalsteps'] > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
Example #19
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):

        env_info = env_register._ENV_INFO[args.task_name]

        self._image_width, self._image_height, self._image_channel = \
            env_info['image_width'], env_info['image_height'], \
            env_info['image_channel']

        super(policy_network, self).__init__(args, session, name_scope,
                                             observation_size, action_size)
        self._base_dir = init_path.get_abs_base_dir()
Example #20
    def __init__(self,
                 env_name='dm-humanoid-noise',
                 rand_seed=1234,
                 misc_info={}):
        self._base_path = init_path.get_abs_base_dir()

        super(env, self).__init__(env_name, rand_seed, misc_info)

        # the noise level
        if env_name in ['dm-humanoid-noise', 'cmu-humanoid-imitation']:
            self._noise_c = 0.01
        else:
            assert env_name in ['dm-humanoid']
            self._noise_c = 0
Example #21
    def __init__(self, env_name, rand_seed, misc_info):
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()

        # return only the observation from reset(), as gym does?
        if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
            self._reset_return_obs_only = True
            self.observation_space, self.action_space = \
                self._env.observation_space, self._env.action_space
            # it's possible some environments have different obs
            self.observation_space = \
                env_util.box(self._env_info['ob_size'], -1, 1)
        else:
            self._reset_return_obs_only = False
Example #22
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):
        super(dynamics_network, self).__init__(args, session, name_scope,
                                               observation_size, action_size)
        self._base_dir = init_path.get_abs_base_dir()
        self._replay_x0 = {
            'data': np.zeros([self.args.gps_init_state_replay_size,
                              self._observation_size]),
            'cursor': 0,
            'size': 0
        }
Example #23
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):
        '''
            @input:
                @ob_placeholder:
                    if this placeholder is not given, we will make one in this
                    class.

                @trainable:
                    If it is set to true, then the policy weights will be
                    trained. It is useful when the class is a subnet which
                    is not trainable
        '''
        super(dynamics_network, self).__init__(args, session, name_scope,
                                               observation_size, action_size)
        self._base_dir = init_path.get_abs_base_dir()
Example #24
def main():
    parser = base_config.get_base_config()
    parser = rs_config.get_rs_config(parser)
    parser = il_config.get_il_config(parser)
    args = base_config.make_parser(parser)
    args = il_config.post_process_config(args)

    if args.write_log:
        args.log_path = logger.set_file_handler(path=args.output_dir,
                                                prefix='inverse_dynamics-' +
                                                args.task,
                                                time_str=args.exp_id)

    print('Training starts at {}'.format(init_path.get_abs_base_dir()))

    train(args)
Example #25
    def __init__(self,
                 args,
                 network_type,
                 task_queue,
                 result_queue,
                 name_scope='trainer'):
        self._num_gps_condition = 1 if args.gps_single_condition \
            else args.num_ilqr_traj
        # the base agent
        super(trainer, self).__init__(args=args,
                                      network_type=network_type,
                                      task_queue=task_queue,
                                      result_queue=result_queue,
                                      name_scope=name_scope)
        self._base_path = init_path.get_abs_base_dir()
        self._iteration = 0
Example #26
    def __init__(self, args, session, name_scope, observation_size,
                 action_size):
        self.args = args

        self._session = session
        self._name_scope = name_scope

        self._observation_size = observation_size
        self._action_size = action_size

        self._task_name = args.task_name
        self._network_shape = args.policy_network_shape

        self._npr = np.random.RandomState(args.seed)

        self._whitening_operator = {}
        self._whitening_variable = []
        self._base_dir = init_path.get_abs_base_dir()
Example #27
    def __init__(self, env_name, rand_seed, misc_info):
        assert env_name in ['gym_humanoid', 'gym_slimhumanoid', 'gym_nostopslimhumanoid']
        super(env, self).__init__(env_name, rand_seed, misc_info)
        self._base_path = init_path.get_abs_base_dir()

        self._len_qpos, self._len_qvel = \
            env_util.get_gym_q_info(self._env, self._current_version)

        # return only the observation from reset(), as gym does?
        if 'reset_type' in misc_info and misc_info['reset_type'] == 'gym':
            self._reset_return_obs_only = True
            self.observation_space, self.action_space = \
                self._env.observation_space, self._env.action_space
            # it's possible some environments have different obs
            self.observation_space = \
                env_util.box(self._env_info['ob_size'], -1, 1)
        else:
            self._reset_return_obs_only = False
Example #28
def main():
    parser = base_config.get_base_config()
    parser = metrpo_config.get_metrpo_config(parser)
    args = base_config.make_parser(parser)

    if args.write_log:
        logger.set_file_handler(path=args.output_dir,
                                prefix='mbrl-metrpo-' + args.task,
                                time_str=args.exp_id)
    print('Training starts at {}'.format(init_path.get_abs_base_dir()))
    from mbbl.trainer import metrpo_trainer
    from mbbl.sampler import singletask_metrpo_sampler
    from mbbl.worker import metrpo_worker
    from mbbl.network.dynamics.deterministic_forward_dynamics import dynamics_network
    from mbbl.network.policy.trpo_policy import policy_network
    from mbbl.network.reward.groundtruth_reward import reward_network

    train(metrpo_trainer, singletask_metrpo_sampler, metrpo_worker,
          dynamics_network, policy_network, reward_network, args)
Example #29
def load_expert_data(traj_data_name, traj_episode_num):
    # the start of the training
    traj_base_dir = init_path.get_abs_base_dir()

    if not traj_data_name.endswith('.npy'):
        traj_data_name = traj_data_name + '.npy'
    data_dir = os.path.join(traj_base_dir, traj_data_name)

    assert os.path.exists(data_dir), \
        logger.error('Invalid path: {}'.format(data_dir))
    expert_trajectory = np.load(data_dir, encoding="latin1")

    # choose only the top trajectories
    if len(expert_trajectory) > traj_episode_num:
        logger.warning('Using only %d trajs out of %d trajs' %
                       (traj_episode_num, len(expert_trajectory)))
    expert_trajectory = expert_trajectory[:min(traj_episode_num,
                                               len(expert_trajectory))]
    return expert_trajectory
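np.load with encoding="latin1" is used here to read trajectory arrays pickled under Python 2. On recent NumPy versions (1.16.3 and later) such object arrays additionally require allow_pickle=True, otherwise the call raises ValueError. A hedged sketch of a more defensive loader; the file path in the commented call is illustrative:

import numpy as np


def load_expert_trajectory(path):
    # object arrays saved from Python 2 need both flags on modern NumPy
    return np.load(path, allow_pickle=True, encoding='latin1')


# expert_trajectory = load_expert_trajectory('expert_data/gym_cheetah.npy')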
Example #30
    def __init__(self, args, session, name_scope,
                 observation_size, action_size):
        '''
            @input:
                @ob_placeholder:
                    if this placeholder is not given, we will make one in this
                    class.

                @trainable:
                    If it is set to true, then the policy weights will be
                    trained. It is useful when the class is a subnet which
                    is not trainable
        '''
        super(reward_network, self).__init__(
            args, session, name_scope, observation_size, action_size
        )
        self._base_dir = init_path.get_abs_base_dir()
        # load the expert data
        self._expert_trajectory_obs = expert_data_util.load_expert_trajectory(
            self.args.expert_data_name, self.args.traj_episode_num
        )