Example #1
def get_roboverse_env_from_params(environment_params):
    import roboverse

    use_predictive_model = environment_params['use_predictive_model']

    env_name = environment_params['env']
    randomize = environment_params['randomize_env']
    observation_mode = environment_params['obs']
    reward_type = environment_params['reward_type']
    single_obj_reward = environment_params['single_obj_reward']
    all_random = environment_params['all_random']
    trimodal_positions_choice = environment_params['trimodal_positions_choice']
    num_objects = environment_params['num_objects']


    base_env = roboverse.make(
        env_name, gui=False, randomize=randomize,
        observation_mode=observation_mode, reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=False,
        num_objects=num_objects,
        all_random=all_random,
        trimodal_positions_choice=trimodal_positions_choice)

    if use_predictive_model:
        model_dir = environment_params['model_dir']
        num_execution_per_step = environment_params['num_execution_per_step']
        img_width = base_env.obs_img_dim
        env = PredictiveModelEnvWrapper(model_dir, num_execution_per_step, base_env=base_env, img_dim=img_width)
        return GymAdapter(domain=None, task=None, env=env)
    else:
        return GymAdapter(domain=None, task=None, env=base_env)
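A minimal usage sketch for the factory above. The dictionary keys are exactly the ones read by get_roboverse_env_from_params; the values (environment name, observation mode, and so on) are illustrative assumptions, not settings taken from the original project.

environment_params = {
    'use_predictive_model': False,       # skip the PredictiveModelEnvWrapper branch
    'env': 'SawyerGraspOneV2-v0',        # assumed env id (it appears in later examples)
    'randomize_env': True,
    'obs': 'pixels_debug',
    'reward_type': 'shaped',
    'single_obj_reward': 0,
    'all_random': False,
    'trimodal_positions_choice': None,
    'num_objects': 1,
}
env = get_roboverse_env_from_params(environment_params)
obs = env.reset()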
Example #2
def create_env(randomize, reward_type, env_index, single_obj_reward):

    model_dir_local = "/home/huihanl/precog_nick/logs/esp_train_results/2020-07/" \
                "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_server = "/nfs/kun1/users/huihanl/" \
                "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                ".basic_image_rnn.BasicImageRNNBijection"

    model_dir_aws = "/home/ubuntu/07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection.basic_image_rnn.BasicImageRNNBijection"
    num_execution_per_step = 2

    base_env = roboverse.make(
        "SawyerGraspOneV2-v0",
        gui=False,
        randomize=randomize,
        observation_mode="pixels_debug",
        reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=True,
    )

    img_width, img_height = base_env.obs_img_dim, base_env.obs_img_dim

    env = PredictiveModelEnvWrapper(model_dir_aws,
                                    num_execution_per_step,
                                    base_env=base_env,
                                    img_dim=img_width)

    return env
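A call sketch for create_env; the argument values are assumptions. Note that env_index, model_dir_local and model_dir_server are unused in this variant: the predictive-model wrapper is always built from model_dir_aws.

# Assuming the wrapper exposes the usual Gym interface:
env = create_env(randomize=True, reward_type="shaped", env_index=0, single_obj_reward=0)
obs = env.reset()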
Example #3
def load_widowx():
    '''
    Loads sim datasets (uses rlkit from cql-private)
    '''
    from rlkit.data_management.load_buffer import load_data_from_npy
    import roboverse

    data_folder_path = '/home/jonathanyang0127/minibullet/data'
    data_file_path = data_folder_path + \
            ('/may25_Widow250OneObjectGraspShed-v0_20K_save_all_noise_0.1_2021-05-25T19-27-54/'
            'may25_Widow250OneObjectGraspShed-v0_20K_save_all_noise_0.1_2021-05-25T19-27-54_20000.npy')
    variant = {
        'buffer': data_file_path,
        'env': 'Widow250OneObjectGraspTrain-v0'
    }

    expl_env = roboverse.make(variant['env'], transpose_image=True)
    replay_buffer = load_data_from_npy(variant, expl_env, ['image'])
    train = WidowXDataset(replay_buffer,
                          train=True,
                          normalize=True,
                          image_dims=(48, 48, 3))
    val = WidowXDataset(replay_buffer,
                        train=False,
                        normalize=True,
                        image_dims=(48, 48, 3))

    return train, val
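The returned splits can be batched with a standard PyTorch DataLoader. This is only a sketch and assumes WidowXDataset implements the torch.utils.data.Dataset interface.

from torch.utils.data import DataLoader

train_set, val_set = load_widowx()
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64, shuffle=False)

for batch in train_loader:
    # each batch holds normalized 48x48x3 image observations from the replay buffer
    pass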
Example #4
def experiment(variant):
    checkpoint_filepath = os.path.join(variant['checkpoint_dir'],
                                       'itr_{}.pkl'.format(
                                           variant['checkpoint_epoch']))
    checkpoint = torch.load(checkpoint_filepath)

    # the following does not work for Bullet envs yet
    # eval_env = checkpoint['evaluation/env']
    # expl_env = checkpoint['exploration/env']

    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env

    policy = checkpoint['trainer/trainer'].policy
    eval_policy = checkpoint['evaluation/policy']
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    observation_key = 'image'
    online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][
        'max_path_length']

    if variant['online_data_only']:
        replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env,
                                            observation_key=observation_key)
    else:
        replay_buffer = load_data_from_npy_chaining(
            variant, expl_env, observation_key,
            extra_buffer_size=online_buffer_size)

    trainer_kwargs = variant['trainer_kwargs']
    assert trainer_kwargs['min_q_weight'] > 0.
    trainer = checkpoint['trainer/trainer']
    trainer.min_q_weight = trainer_kwargs['min_q_weight']

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=False,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
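A sketch of the variant dictionary this resume-from-checkpoint experiment expects. The keys shown are the ones read above; load_data_from_npy_chaining and VideoSaveFunction read further keys not listed here, and all values are illustrative assumptions.

variant = dict(
    checkpoint_dir='/path/to/logs/run',                  # directory holding itr_<epoch>.pkl
    checkpoint_epoch=100,
    env='Widow250DoubleDrawerCloseOpenNeutral-v0',       # any roboverse env id
    online_data_only=False,
    algorithm_kwargs=dict(max_path_length=30),           # plus the usual TorchBatchRLAlgorithm settings
    trainer_kwargs=dict(min_q_weight=1.0),               # must be > 0, per the assert above
)
experiment(variant)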
Example #5
def main(args):

    timestamp = get_timestamp()
    if osp.exists(NFS_PATH):
        data_save_path = osp.join(NFS_PATH, args.save_directory)
    else:
        data_save_path = osp.join(__file__, "../..", "data", args.save_directory)
    data_save_path = osp.abspath(data_save_path)
    if not osp.exists(data_save_path):
        os.makedirs(data_save_path)

    env = roboverse.make(args.env_name,
                         gui=args.gui,
                         transpose_image=False)

    data = []
    assert args.policy_name in policies.keys(), f"The policy name must be one of: {policies.keys()}"
    assert args.accept_trajectory_key in env.get_info().keys(), \
        f"""The accept trajectory key must be one of: {env.get_info().keys()}"""
    policy_class = policies[args.policy_name]
    policy = policy_class(env)
    num_success = 0
    num_saved = 0
    num_attempts = 0
    accept_trajectory_key = args.accept_trajectory_key

    progress_bar = tqdm(total=args.num_trajectories)

    while num_saved < args.num_trajectories:
        num_attempts += 1
        traj, success, num_steps = collect_one_traj(
            env, policy, args.num_timesteps, args.noise,
            accept_trajectory_key)

        if success:
            if args.gui:
                print("num_timesteps: ", num_steps)
            data.append(traj)
            num_success += 1
            num_saved += 1
            progress_bar.update(1)
        elif args.save_all:
            data.append(traj)
            num_saved += 1
            progress_bar.update(1)

        if args.gui:
            print("success rate: {}".format(num_success/(num_attempts)))

    progress_bar.close()
    print("success rate: {}".format(num_success / (num_attempts)))
    path = osp.join(data_save_path, "scripted_{}_{}.npy".format(
        args.env_name, timestamp))
    print(path)
    np.save(path, data)
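main() only touches a handful of attributes on args; a hedged argparse front end that supplies them could look like this (flag names and defaults are assumptions).

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--env-name", type=str, required=True)
    parser.add_argument("-pl", "--policy-name", type=str, required=True)
    parser.add_argument("-a", "--accept-trajectory-key", type=str, required=True)
    parser.add_argument("-n", "--num-trajectories", type=int, required=True)
    parser.add_argument("-t", "--num-timesteps", type=int, required=True)
    parser.add_argument("-d", "--save-directory", type=str, default="scripted_data")
    parser.add_argument("--noise", type=float, default=0.1)
    parser.add_argument("--save-all", action="store_true")
    parser.add_argument("--gui", action="store_true")
    main(parser.parse_args())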
Example #6
def create_env_action(randomize, reward_type, env_index, single_obj_reward, trimodal_positions):

    base_env = roboverse.make(
        "SawyerGraspOneV2-v0", gui=False, randomize=randomize,
        observation_mode="pixels_debug", reward_type=reward_type,
        single_obj_reward=single_obj_reward,
        normalize_and_flatten=True,
        all_random=True,
        trimodal_positions=trimodal_positions)

    return base_env
Example #7
def keyboard_control(args):
    env = roboverse.make(args.env_name, gui=True)

    while True:
        take_action = False
        action = np.array([0, 0, 0, 0, 0, 0, 0], dtype='float32')
        keys = bullet.p.getKeyboardEvents()
        for qKey in keys:
            if qKey in KEY_TO_ACTION_MAPPING.keys():
                action += KEY_TO_ACTION_MAPPING[qKey]
                take_action = True
            elif qKey in ENV_COMMANDS.keys():
                ENV_COMMANDS[qKey](env)
                take_action = False

        if take_action:
            obs, rew, done, info = env.step(action)
            print(rew)
        time.sleep(0.1)
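keyboard_control relies on two module-level tables that are not shown in this snippet. A sketch of their expected shape, with pybullet key codes and action values chosen purely for illustration:

import numpy as np
import pybullet as p

# Each entry contributes a 7-dim delta action; the bindings below are assumptions, not the originals.
KEY_TO_ACTION_MAPPING = {
    p.B3G_UP_ARROW:    np.array([0.5, 0, 0, 0, 0, 0, 0], dtype='float32'),
    p.B3G_DOWN_ARROW:  np.array([-0.5, 0, 0, 0, 0, 0, 0], dtype='float32'),
    p.B3G_LEFT_ARROW:  np.array([0, -0.5, 0, 0, 0, 0, 0], dtype='float32'),
    p.B3G_RIGHT_ARROW: np.array([0, 0.5, 0, 0, 0, 0, 0], dtype='float32'),
}

# Keys mapped to environment-level commands rather than actions.
ENV_COMMANDS = {
    ord('r'): lambda env: env.reset(),
}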
Example #8
def create_env(randomize, reward_type, env_index, single_obj_reward):

    model_dir_local = "/home/huihanl/precog_nick/logs/esp_train_results/2020-07/" \
                "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                ".basic_image_rnn.BasicImageRNNBijection"
    model_dir_server = "/nfs/kun1/users/huihanl/" \
                "07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection" \
                ".basic_image_rnn.BasicImageRNNBijection"

    model_dir_aws = "/home/ubuntu/07-23-19-52-51_dataset.sawyer_dataset_no_append.SawyerDatasetNoAppend_bijection.basic_image_rnn.BasicImageRNNBijection"
    num_execution_per_step = 2

    trimodal_positions6 = [
        (0.8187814400771692, 0.21049907010351596, -0.3415106684025205),
        (0.739567302423451, 0.14341819851789023, -0.341380192135101),
        (0.6694743281369817, 0.26676337932361205, -0.3440640126774397)
    ]

    trimodal_positions5 = [
        (0.6838513781345804, 0.13232607273283675, -0.34202212957838085),
        (0.765710933262148, 0.17467922332524338, -0.3351583030520283),
        (0.6789658820114347, 0.2548283056922718, -0.34103265910465325)
    ]

    trimodal_positions0 = [
        (0.7044727231056753, 0.2885882337328971, -0.3434554430497813),
        (0.6950925950841189, 0.10769150255843486, -0.3408203519762647),
        (0.8303787805809874, 0.12114265344994643, -0.34381720097869056)
    ]

    trimodal_positions1 = [
        (0.7543401984506464, 0.21230734165805112, -0.3400822998707757),
        (0.7250127303935135, 0.1172465013108163, -0.34050488361050935),
        (0.668601621056849, 0.20450849328681078, -0.34356676661067254)
    ]

    trimodal_positions2 = [
        (0.6623195604074853, 0.1420046836817064, -0.34036979015824237),
        (0.7490314063602679, 0.11150642565578632, -0.34010550517840776),
        (0.8313050761244212, 0.186704691256355, -0.3444288731959656)
    ]

    trimodal_positions3 = [
        (0.702806187582968, 0.282862951083425, -0.34329308543453),
        (0.8053413878164436, 0.15063075895870554, -0.34222136237585643),
        (0.6598757532001869, 0.10674964260605753, -0.34047814967568574)
    ]

    trimodal_positions4 = [
        (0.7935222824622936, 0.19097678049219627, -0.336295087280328),
        (0.6742503469035555, 0.1988108314637719, -0.3439745727367933),
        (0.6661682273867254, 0.09909463325237348, -0.3399316482536911)
    ]

    trimodal_positions_choice = [
        trimodal_positions0, trimodal_positions1, trimodal_positions2,
        trimodal_positions3, trimodal_positions4, trimodal_positions5,
        trimodal_positions6
    ]

    trimodal_positions = trimodal_positions_choice[env_index]

    base_env = roboverse.make("SawyerGraspOneV2-v0",
                              gui=False,
                              randomize=randomize,
                              observation_mode="pixels_debug",
                              reward_type=reward_type,
                              single_obj_reward=single_obj_reward,
                              normalize_and_flatten=True,
                              all_random=True,
                              trimodal_positions=trimodal_positions)

    img_width, img_height = base_env.obs_img_dim, base_env.obs_img_dim

    env = PredictiveModelEnvWrapper(model_dir_aws,
                                    num_execution_per_step,
                                    base_env=base_env,
                                    img_dim=img_width)

    return env
Example #9
        info = self.get_info()
        return bool(info['button_pressed_percentage'] >
                    self.button_pressed_success_thresh)

    def get_reward(self, info):
        if self.reward_type == "button_press":
            return float(info['button_pressed_success'])
        elif self.reward_type == "grasp":
            return float(info['grasp_success_target'])
        else:
            raise NotImplementedError


if __name__ == "__main__":
    env = roboverse.make("Widow250RandPosButtonPressTwoObjGrasp-v0",
                         gui=True,
                         transpose_image=False)
    import time
    env.reset()
    # import IPython; IPython.embed()

    for j in range(5):
        object_utils.pop_up_button(env.objects['button'])
        time.sleep(1)
        object_utils.push_down_button(env.objects['button'])
        time.sleep(1)
        object_utils.pop_up_button(env.objects['button'])
        for i in range(20):
            obs, rew, done, info = env.step(
                np.asarray([-0.05, 0., 0., 0., 0., 0.5, 0.]))
            print("reward", rew, "info", info)
Example #10
    def get_info(self):
        info = super(Widow250DoubleDrawerEnv, self).get_info()
        info['drawer_top_x_pos'] = self.get_drawer_pos("drawer_top")[0]
        info['drawer_top_opened_percentage'] = \
            self.get_drawer_opened_percentage("drawer_top")
        info['drawer_top_closed_success'] = info["drawer_top_opened_percentage"] \
            < self.drawer_closed_success_thresh
        return info

    def is_top_drawer_closed(self):
        info = self.get_info()
        return info['drawer_top_closed_success']


if __name__ == "__main__":
    env = roboverse.make('Widow250DoubleDrawerCloseOpenNeutral-v0',
                         gui=True,
                         transpose_image=False)
    import time
    env.reset()
    # import IPython; IPython.embed()

    for j in range(5):
        for i in range(20):
            obs, rew, done, info = env.step(
                np.asarray([-0.05, 0., 0., 0., 0., 0.5, 0., 0.]))
            print("reward", rew, "info", info)
            time.sleep(0.1)
        env.reset()
Example #11
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--num-trajectories", type=int, required=True)
    parser.add_argument("-t", "--num-timesteps", type=int, required=True)
    parser.add_argument("-e", "--env-name", type=str, required=True)
    parser.add_argument("--task-name", type=str, required=True)
    args = parser.parse_args()

    time_str = timestamp()  # avoid rebinding the timestamp() helper to a string
    data_save_path = os.path.join(__file__, "../..", 'data', time_str)
    data_save_path = os.path.abspath(data_save_path)
    if not os.path.exists(data_save_path):
        os.makedirs(data_save_path)

    data = []
    env = roboverse.make(args.env_name,
                         gui=True,
                         control_mode='discrete_gripper')
    env.reset()

    for j in tqdm(range(args.num_trajectories)):
        success, images, traj = collect_one_trajectory(env, args.num_timesteps)
        while success != 'y' and success != 'Y':
            print("failed for trajectory {}, collect again".format(j))
            success, images, traj = collect_one_trajectory(
                env, args.num_timesteps)
        data.append(traj)

        if j % 50 == 0:
            path = os.path.join(
                data_save_path,
                "{}_{}_{}_{}.npy".format(args.env_name, args.task_name,
Example #12
def experiment(variant):

    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode=variant['obs'], reward_type='shaped',
                              transpose_image=True)

    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (
            action_dim + qf_cnn.conv_output_flat_size
    )
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )

    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (
            action_dim + target_qf_cnn.conv_output_flat_size
    )

    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )

    policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )

    buffer_policy = GaussianMixtureObsProcessorPolicy(
        obs_dim=policy_cnn.conv_output_flat_size,
        action_dim=action_dim,
        obs_processor=policy_obs_processor,
        **variant['policy_kwargs']
    )

    # policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )

    # buffer_policy = TanhGaussianPolicyAdapter(
    #     policy_obs_processor,
    #     policy_cnn.conv_output_flat_size,
    #     action_dim,
    #     **variant['policy_kwargs']
    # )

    observation_key = 'image'
    replay_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    trainer = AWRSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        buffer_policy=buffer_policy,
        **variant['trainer_kwargs']
    )

    expl_policy = policy
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        **variant['expl_path_collector_kwargs']
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )

    algorithm.to(ptu.device)

    demo_train_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )

    demo_test_buffer = ObsDictReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        observation_key=observation_key,
    )


    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            **path_loader_kwargs
        )
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()
Example #13
def experiment(variant):
    checkpoint_filepath = os.path.join(variant['checkpoint_dir'],
                                       'itr_{}.pkl'.format(
                                           variant['checkpoint_epoch']))
    checkpoint = torch.load(checkpoint_filepath)


    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env

    action_dim = eval_env.action_space.low.size
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    policy = checkpoint['evaluation/policy']
    eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    observation_key = 'image'
    online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][
        'max_path_length']

    if variant['online_data_only']:
        replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env,
                                            observation_key=observation_key)
    else:
        replay_buffer = load_data_from_npy_chaining(
            variant, expl_env, observation_key,
            extra_buffer_size=online_buffer_size)

    trainer_kwargs = variant['trainer_kwargs']
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **trainer_kwargs
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=False,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #14
    def __init__(self,
                 env_name,
                 checkpoint_filename,
                 num_timesteps_per_traj,
                 accept_trajectory_key,
                 video_save_dir,
                 save_all,
                 save_images,
                 add_robot_view,
                 noise=0.1):
        self.env_name = env_name
        self.num_timesteps_per_traj = num_timesteps_per_traj
        self.accept_trajectory_key = accept_trajectory_key
        self.noise = noise
        chkpt_split = checkpoint_filename.split('/')
        logdir = chkpt_split[-2] + '-' + chkpt_split[-1].split('.')[0]
        self.video_save_dir = os.path.join(video_save_dir, logdir)
        self.save_all = save_all
        if save_images:
            self.save_function = self.save_images
        else:
            self.save_function = self.save_video

        self.image_size = 512
        self.add_robot_view = add_robot_view

        if not os.path.exists(self.video_save_dir):
            os.makedirs(self.video_save_dir)

        # cam for pick and place (front view)
        # self.camera_target_pos = [0.55, 0.1, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (front view)
        # self.camera_target_pos = [0.60, 0.05, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (canonical view)
        self.camera_target_pos = [0.55, 0., -0.30]
        self.camera_roll = 0.0
        self.camera_pitch = -30.0
        self.camera_yaw = 150.0
        self.camera_distance = 0.64

        self.view_matrix_args = dict(target_pos=self.camera_target_pos,
                                     distance=self.camera_distance,
                                     yaw=self.camera_yaw,
                                     pitch=self.camera_pitch,
                                     roll=self.camera_roll,
                                     up_axis_index=2)
        self.view_matrix = bullet.get_view_matrix(**self.view_matrix_args)
        self.projection_matrix = bullet.get_projection_matrix(
            self.image_size, self.image_size)
        # end camera settings
        self.env = roboverse.make(self.env_name,
                                  gui=False,
                                  transpose_image=True)

        checkpoint = torch.load(checkpoint_filename)
        self.policy = checkpoint['evaluation/policy'].cpu()
        self.checkpoint_name = 'test123'
        self.trajectories_collected = 0
Example #15
def experiment(variant):
    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped')
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    obs_dim = eval_env.observation_space.shape[0]

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],  # Making it easier to visualize
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMdpPathCollector(expl_env, )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         behavior_policy=None,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #16
def create_env_action(randomize, reward_type, env_index, single_obj_reward):

    trimodal_positions6 = [
        (0.8187814400771692, 0.21049907010351596, -0.3415106684025205),
        (0.739567302423451, 0.14341819851789023, -0.341380192135101),
        (0.6694743281369817, 0.26676337932361205, -0.3440640126774397)
    ]

    trimodal_positions5 = [
        (0.6838513781345804, 0.13232607273283675, -0.34202212957838085),
        (0.765710933262148, 0.17467922332524338, -0.3351583030520283),
        (0.6789658820114347, 0.2548283056922718, -0.34103265910465325)
    ]

    trimodal_positions0 = [
        (0.7044727231056753, 0.2885882337328971, -0.3434554430497813),
        (0.6950925950841189, 0.10769150255843486, -0.3408203519762647),
        (0.8303787805809874, 0.12114265344994643, -0.34381720097869056)
    ]

    trimodal_positions1 = [
        (0.7543401984506464, 0.21230734165805112, -0.3400822998707757),
        (0.7250127303935135, 0.1172465013108163, -0.34050488361050935),
        (0.668601621056849, 0.20450849328681078, -0.34356676661067254)
    ]

    trimodal_positions2 = [
        (0.6623195604074853, 0.1420046836817064, -0.34036979015824237),
        (0.7490314063602679, 0.11150642565578632, -0.34010550517840776),
        (0.8313050761244212, 0.186704691256355, -0.3444288731959656)
    ]

    trimodal_positions3 = [
        (0.702806187582968, 0.282862951083425, -0.34329308543453),
        (0.8053413878164436, 0.15063075895870554, -0.34222136237585643),
        (0.6598757532001869, 0.10674964260605753, -0.34047814967568574)
    ]

    trimodal_positions4 = [
        (0.7935222824622936, 0.19097678049219627, -0.336295087280328),
        (0.6742503469035555, 0.1988108314637719, -0.3439745727367933),
        (0.6661682273867254, 0.09909463325237348, -0.3399316482536911)
    ]

    trimodal_positions_choice = [
        trimodal_positions0, trimodal_positions1, trimodal_positions2,
        trimodal_positions3, trimodal_positions4, trimodal_positions5,
        trimodal_positions6
    ]

    trimodal_positions = trimodal_positions_choice[env_index]

    base_env = roboverse.make("SawyerGraspOneV2-v0",
                              gui=False,
                              randomize=randomize,
                              observation_mode="pixels_debug",
                              reward_type=reward_type,
                              single_obj_reward=single_obj_reward,
                              normalize_and_flatten=True,
                              all_random=True,
                              trimodal_positions=trimodal_positions)

    return base_env
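Illustrative call: env_index picks one of the seven hand-coded object layouts above; the other argument values are assumptions.

env = create_env_action(randomize=False, reward_type="shaped",
                        env_index=3, single_obj_reward=0)
obs = env.reset()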
Example #17
    else:
        raise NotImplementedError

    string_end = args.checkpoint.find('-v0')
    env_string = args.checkpoint[string_start:string_end]

    if 'pixel' in args.checkpoint:
        obs_mode = 'pixels'
    elif 'state' in args.checkpoint:
        obs_mode = 'state'
    else:
        raise NotImplementedError

    print('using env: {}'.format(env_string))
    env = roboverse.make('{}-v0'.format(env_string),
                         gui=True,
                         observation_mode=obs_mode,
                         transpose_image=True)

    return_list = []
    success_list = []
    max_path_length = 30

    for _ in range(100):

        obs = env.reset()
        if obs_mode == 'pixels':
            obs = obs['image']
        ret = 0
        for _ in range(max_path_length):
            action, _ = policy.get_action(obs)
            # print(action)
Example #18
def experiment(variant):
    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode=variant['obs'],
                              reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    img_width, img_height = eval_env.image_shape
    num_channels = 3
    action_dim = int(np.prod(eval_env.action_space.shape))
    # obs_dim = 11

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=0,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )

    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (action_dim +
                                      target_qf_cnn.conv_output_flat_size)

    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                       policy_cnn.conv_output_flat_size,
                                       action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    observation_key = 'image'

    eval_path_collector = ObsDictPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        **variant['eval_path_collector_kwargs'])

    expl_path_collector = CustomObsDictPathCollector(
        expl_env,
        observation_key=observation_key,
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         behavior_policy=None,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #19
def experiment(variant):

    expl_env = roboverse.make(variant['env'], gui=False, randomize=True,
                              observation_mode='state', reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = 11

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs']
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
        **variant['expl_path_collector_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=False,  # should be false for SAC
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs']
    )
    dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #20
def experiment(variant):

    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=variant['randomize_env'],
                              observation_mode=variant['obs'],
                              reward_type='shaped',
                              transpose_image=True)

    if variant['obs'] == 'pixels_debug':
        robot_state_dims = 11
    elif variant['obs'] == 'pixels':
        robot_state_dims = 4
    else:
        raise NotImplementedError

    expl_env = FlatEnv(expl_env,
                       use_robot_state=variant['use_robot_state'],
                       robot_state_dims=robot_state_dims)
    eval_env = expl_env

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
    )
    if variant['use_robot_state']:
        cnn_params.update(
            added_fc_input_size=robot_state_dims,
            output_conv_channels=False,
            hidden_sizes=[400, 400],
            output_size=200,
        )
    else:
        cnn_params.update(
            added_fc_input_size=0,
            output_conv_channels=True,
            output_size=None,
        )
    qf_cnn = CNN(**cnn_params)

    if variant['use_robot_state']:
        qf_obs_processor = qf_cnn
    else:
        qf_obs_processor = nn.Sequential(
            qf_cnn,
            Flatten(),
        )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1

    if variant['use_robot_state']:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.output_size)
    else:
        qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)

    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        target_qf_obs_processor = target_qf_cnn
    else:
        target_qf_obs_processor = nn.Sequential(
            target_qf_cnn,
            Flatten(),
        )

    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1

    if variant['use_robot_state']:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.output_size)
    else:
        target_qf_kwargs['input_size'] = (action_dim +
                                          target_qf_cnn.conv_output_flat_size)

    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    if variant['use_robot_state']:
        policy_obs_processor = policy_cnn
    else:
        policy_obs_processor = nn.Sequential(
            policy_cnn,
            Flatten(),
        )

    if variant['use_robot_state']:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.output_size, action_dim,
                                           **variant['policy_kwargs'])
    else:
        policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                           policy_cnn.conv_output_flat_size,
                                           action_dim,
                                           **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    else:
        raise NotImplementedError

    video_func = VideoSaveFunctionBullet(variant)
    algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #21
def experiment(variant):
    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env
    action_dim = int(np.prod(eval_env.action_space.shape))
    obs_dim = 11

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    buffer_policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )

    trainer = AWRSACTrainer(env=eval_env,
                            policy=policy,
                            qf1=qf1,
                            qf2=qf2,
                            target_qf1=target_qf1,
                            target_qf2=target_qf2,
                            buffer_policy=buffer_policy,
                            **variant['trainer_kwargs'])

    expl_policy = policy
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )

    replay_buffer = EnvReplayBuffer(**replay_buffer_kwargs)

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        num_epochs=variant['num_epochs'],
        num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'],
        num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'],
        num_trains_per_train_loop=variant['num_trains_per_train_loop'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
    )

    algorithm.to(ptu.device)

    demo_train_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )
    demo_test_buffer = EnvReplayBuffer(**replay_buffer_kwargs, )

    path_loader_kwargs = variant.get("path_loader_kwargs", {})

    save_paths = None  # FIXME(avi)
    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)
    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs)
        path_loader.load_demos()
    if variant.get('pretrain_policy', False):
        trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        trainer.pretrain_q_with_bc_data()
    if variant.get('save_pretrained_algorithm', False):
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))
    if variant.get('train_rl', True):
        algorithm.train()
Example #22
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )

    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    if variant['stoch_eval_policy']:
        eval_policy = policy
    else:
        eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    trainer = BCTrainer(
        env=eval_env,
        policy=policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #23
    def __init__(self,
                 env_name,
                 scripted_policy_name,
                 num_timesteps_per_traj,
                 accept_trajectory_key,
                 video_save_dir,
                 save_all,
                 save_images,
                 add_robot_view,
                 noise=0.1):
        self.env_name = env_name
        self.num_timesteps_per_traj = num_timesteps_per_traj
        self.accept_trajectory_key = accept_trajectory_key
        self.noise = noise
        self.video_save_dir = video_save_dir
        self.save_all = save_all
        if save_images:
            self.save_function = self.save_images
        else:
            self.save_function = self.save_video

        self.image_size = 512
        self.add_robot_view = add_robot_view

        if not os.path.exists(self.video_save_dir):
            os.makedirs(self.video_save_dir)
        # camera settings (default)
        # self.camera_target_pos = [0.57, 0.2, -0.22]
        # self.camera_roll = 0.0
        # self.camera_pitch = -10
        # self.camera_yaw = 215
        # self.camera_distance = 0.4

        # drawer cam (front view)
        # self.camera_target_pos = [0.60, 0.05, -0.30]
        # self.camera_roll = 0.0
        # self.camera_pitch = -30.0
        # self.camera_yaw = 180.0
        # self.camera_distance = 0.50

        # drawer cam (canonical view)
        self.camera_target_pos = [0.55, 0., -0.30]
        self.camera_roll = 0.0
        self.camera_pitch = -30.0
        self.camera_yaw = 150.0
        self.camera_distance = 0.64

        self.view_matrix_args = dict(target_pos=self.camera_target_pos,
                                     distance=self.camera_distance,
                                     yaw=self.camera_yaw,
                                     pitch=self.camera_pitch,
                                     roll=self.camera_roll,
                                     up_axis_index=2)
        self.view_matrix = bullet.get_view_matrix(**self.view_matrix_args)
        self.projection_matrix = bullet.get_projection_matrix(
            self.image_size, self.image_size)
        # end camera settings
        self.env = roboverse.make(self.env_name,
                                  gui=False,
                                  transpose_image=False)
        assert scripted_policy_name in policies.keys()
        self.policy_name = scripted_policy_name
        policy_class = policies[scripted_policy_name]
        self.scripted_policy = policy_class(self.env)
        self.trajectories_collected = 0
Example #24
def experiment(variant):

    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped')
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = obs_dim = 11
    M = 256

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        latent_dim=action_dim * 2,
    )

    eval_path_collector = CustomMdpPathCollector(
        eval_env, save_images=True, **variant['eval_path_collector_kwargs'])

    vae_eval_path_collector = CustomMdpPathCollector(
        eval_env,
        max_num_epoch_paths_saved=5,
        save_images=True,
    )

    with open(variant['buffer'], 'rb') as f:
        replay_buffer = pickle.load(f)

    trainer = BEARTrainer(env=eval_env,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          target_qf1=target_qf1,
                          target_qf2=target_qf2,
                          vae=vae_policy,
                          **variant['trainer_kwargs'])

    expl_path_collector = MdpPathCollector(
        expl_env, policy, **variant['expl_path_collector_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        vae_evaluation_data_collector=vae_eval_path_collector,
        replay_buffer=replay_buffer,
        q_learning_alg=True,
        batch_rl=variant['batch_rl'],
        **variant['algo_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
Example #25
def experiment(variant):

    expl_env = roboverse.make(variant['env'],
                              gui=False,
                              randomize=True,
                              observation_mode='state',
                              reward_type='shaped',
                              transpose_image=True)
    eval_env = expl_env

    action_dim = int(np.prod(eval_env.action_space.shape))
    state_dim = eval_env.observation_space.shape[0]

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = action_dim + state_dim
    qf1 = MlpQf(**qf_kwargs)
    qf2 = MlpQf(**qf_kwargs)

    target_qf_kwargs = copy.deepcopy(qf_kwargs)
    target_qf1 = MlpQf(**target_qf_kwargs)
    target_qf2 = MlpQf(**target_qf_kwargs)

    policy_kwargs = copy.deepcopy(variant['policy_kwargs'])
    policy_kwargs['action_dim'] = action_dim
    policy_kwargs['obs_dim'] = state_dim
    policy = TanhGaussianPolicy(**policy_kwargs)

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])

    if variant['scripted_policy']:
        if 'V3-v0' in variant['env']:
            scripted_policy = GraspV3ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V4-v0' in variant['env']:
            scripted_policy = GraspV4ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        elif 'V5-v0' in variant['env']:
            scripted_policy = GraspV5ScriptedPolicy(
                expl_env, noise_std=variant['scripted_noise_std'])
        else:
            raise NotImplementedError
    else:
        scripted_policy = None

    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability_init=0.5,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            optional_expl_policy=scripted_policy,
            optional_expl_probability=0.9,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])

    dump_buffer_func = BufferSaveFunction(variant)
    # algorithm.post_train_funcs.append(dump_buffer_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #26
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )

    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env, )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(variant, expl_env,
                                                observation_key)

    # Translate 0/1 rewards to +4/+10 rewards.
    if variant['use_positive_rew']:
        if set(np.unique(replay_buffer._rewards)).issubset({0, 1}):
            replay_buffer._rewards = replay_buffer._rewards * 6.0
            replay_buffer._rewards = replay_buffer._rewards + 4.0
        assert set(np.unique(replay_buffer._rewards)).issubset(
            set(6.0 * np.array([0, 1]) + 4.0))

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs'])
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
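A standalone check of the reward relabeling arithmetic used above: 0/1 rewards map to +4/+10 via r * 6 + 4.

import numpy as np

rewards = np.array([0., 1., 1., 0.])
relabeled = rewards * 6.0 + 4.0
assert set(np.unique(relabeled)).issubset({4.0, 10.0})
print(relabeled)  # [ 4. 10. 10.  4.]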