예제 #1
0
def simulate_policy(args):
    """Replay a saved SawyerReach policy and report the average return.

    Loads a snapshot from ``args.file`` (either a whole algorithm object
    or a bare policy), rebuilds the rendering environment, runs
    ``args.trials`` rollouts, and prints mean +/- std of episode returns.
    """
    with tf.Session() as sess:
        snapshot = joblib.load(args.file)
        # Snapshots either wrap the policy inside the algorithm object or
        # store it directly under 'policy'.
        if 'algo' in snapshot.keys():
            policy = snapshot['algo'].policy
        else:
            policy = snapshot['policy']

        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReach(
                    # playable params
                    random_arm_init=[0.0, 0.0],
                    seed=True,
                    has_renderer=True,
                    reward_shaping=True,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )
            )
        )

        episode_returns = []
        with policy.deterministic(args.deterministic):
            for _ in range(args.trials):
                path = rollout(
                    env,
                    policy,
                    max_path_length=horizon - 1,
                    animated=True,
                    speedup=1.5,
                    always_return_paths=True,
                )
                episode_returns.append(np.sum(path["rewards"]))
        print("Average Return {}+/-{}".format(np.mean(episode_returns),
                                              np.std(episode_returns)))
예제 #2
0
def run_experiment(variant):
    """Train SAC with IK sub-level policies on the sawyer-reach domain.

    Builds the wrapped environment, replay buffer, sampler, value/Q
    networks and pointer policy, then runs SAC training to completion.

    Raises:
        ValueError: if ``args.domain`` is anything but 'sawyer-reach'.
    """
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]

    if args.domain != 'sawyer-reach':
        raise ValueError("Domain not available")

    goal_size = 0
    # One IK primitive per Cartesian axis.
    sub_level_policies_paths = ["ikx", "iky", "ikz"]
    horizon = 250
    env = normalize(
        CRLWrapper(
            IKWrapper(
                domain(
                    # playable params
                    random_arm_init=[-0.1, 0.1],
                    lower_goal_range=[-0.1, -0.1, -0.1],
                    upper_goal_range=[0.1, 0.1, 0.1],
                    has_renderer=False,
                    reward_shaping=True,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                ))))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # one step short of the env horizon
        min_pool_size=1000,
        batch_size=256,
    )
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
    )

    hidden_sizes = (128, 128)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes,
                      name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes,
                      name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=hidden_sizes,
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
예제 #3
0
def run_experiment(param):
    """Train a flat (non-compositional) SAC agent on SawyerReach."""
    horizon = 250
    env = normalize(
        CRLWrapper(
            SawyerReach(
                # playable params
                random_arm_init=[-0.1, 0.1],
                lower_goal_range=[-0.1, -0.1, -0.1],
                upper_goal_range=[0.1, 0.1, 0.1],
                has_renderer=False,
                reward_shaping=True,
                horizon=horizon,
                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # one step short of the env horizon
        min_pool_size=1000,
        batch_size=256,
    )
    base_kwargs = dict(
        epoch_length=1500,
        n_train_repeat=1,
        n_initial_exploration_steps=5000,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        n_epochs=2e3,
        sampler=sampler,
    )

    hidden_sizes = (64, 64)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes,
                      name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes,
                      name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden_sizes)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
예제 #4
0
def simulate_policy(args):
    """Roll out a trained composition policy with its sub-level policies.

    Builds the environment for ``args.domain``, loads the sub-level
    (primitive) policies plus the top-level policy snapshot from
    ``args.file``, runs ``args.trials`` rollouts, and prints the mean
    and standard deviation of the episode returns.

    Returns:
        The (normalized, wrapped) environment that was simulated.

    Raises:
        ValueError: if ``args.domain`` is not a known domain.
    """
    paths = []  # kept for the commented-out path-saving code below
    sub_level_policies = []
    sub_level_policies_paths = []

    render = True
    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        # "ik?" entries are resolved to IK_Policy instances below.
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]

        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = True

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.02, 0.02],
                        y_range=[-0.02, 0.02],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
    else:
        # FIX: the original branch only printed "NO DOMAIN" and fell
        # through, crashing later with NameError on env/goal_size/horizon.
        # Fail fast instead, consistent with run_experiment.
        raise ValueError("Domain not available")

    with tf.Session() as sess:
        # Resolve each sub-level entry: "ik?" names become IK policies,
        # anything else is loaded as a pickled policy snapshot. Each gets
        # its own variable scope so TF variable names do not collide.
        for p in range(0, len(sub_level_policies_paths)):
            path = sub_level_policies_paths[p]
            if path[:2] == 'ik':
                with tf.variable_scope(str(p), reuse=False):
                    policy_snapshot = IK_Policy(path[2])
                sub_level_policies.append(policy_snapshot)
            else:
                with tf.variable_scope(str(p), reuse=False):
                    policy_snapshot = joblib.load(sub_level_policies_paths[p])
                sub_level_policies.append(policy_snapshot["policy"])

        # Top-level policy: snapshot stores either the whole algorithm or
        # the bare policy.
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            # env = data['algo'].env
        else:
            policy = data['policy']
            # env = data['env']

        mean_reward = []
        with policy.deterministic(args.deterministic):
            Npath = 0
            while Npath < args.trials:
                path = rollout(env,
                               policy,
                               sub_level_policies,
                               path_length=horizon - 1,
                               g=goal_size)
                # if np.sum(path["rewards"])>=args.reward_min:
                #     print(np.sum(path["rewards"]))
                #     paths.append(dict(
                #         observations= env.observation_space.flatten_n(path["observations"]),
                #         actions= env.observation_space.flatten_n(path["actions"]),
                #         rewards= tensor_utils.stack_tensor_list(path["rewards"]),
                #         env_infos= path["env_infos"],
                #         agent_infos= path["agent_infos"],
                #         goal = path["goal"]
                #     ))
                Npath += 1
                mean_reward.append(np.sum(path["rewards"]))
            print("Average Return {}+/-{}".format(np.mean(mean_reward),
                                                  np.std(mean_reward)))
            # fileName = osp.join(args.FilePath,'itr.pkl')
            # joblib.dump(paths,fileName, compress=3)
        return env
예제 #5
0
def run_experiment(variant):
    """Train a compositional SAC agent on the selected Sawyer domain.

    Selects the environment and per-domain hyper-parameters from
    ``args.domain``, builds the replay buffer (demo-seeded if
    ``args.demo``), sampler, Q/value networks and pointer policy, then
    runs SAC training.

    Raises:
        ValueError: if ``args.domain`` is not a known domain.
    """
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        # One IK primitive per Cartesian axis.
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        # Pre-trained primitive snapshots for pick and reach.
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # "simple": object placement is tightly constrained.
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    # Demo buffer replays expert data alongside collected experience;
    # otherwise a plain FIFO replay buffer is used.
    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )
    # Hidden-layer width shared by all networks.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
예제 #6
0
# reset env to get initial obs = initial state
render = False
random_arm_init = [-0.1, 0.1]
reward_shaping = True
horizon = 250
env = normalize(
    CRLWrapper(
        # IKWrapper(
        SawyerReach(
            # playable params
            random_arm_init=random_arm_init,
            seed=True,
            has_renderer=render,
            reward_shaping=reward_shaping,
            horizon=horizon,

            # constant params
            has_offscreen_renderer=False,
            use_camera_obs=False,
            use_object_obs=True,
            control_freq=100,
        )
        # )
    ))
obs = env.reset()
ik = inverse_kin(env)
# First three entries of the observation are used as the starting point
# of the sphere; presumably the end-effector xyz position — TODO confirm.
start = obs[0:3]

# Sample 1000 points on a sphere (radius 0.1) centered at `start`;
# x/y action targets are the per-axis point coordinates offset by start.
points = fibonacci_sphere(0.1, 1000)
x_actions = np.array([points[i][0] for i in range(len(points))]) + start[0]
y_actions = np.array([points[i][1] for i in range(len(points))]) + start[1]