Example #1
def get_r_prior(prior, reward_center, std):
    """Return the reward prior distribution, or None for a uniform (flat) prior."""
    check_in("prior", prior, ("gaussian", "laplace", "uniform"))
    if prior == "gaussian":
        return NormalDistribution(reward_center, std)
    elif prior == "laplace":
        return LaplaceDistribution(reward_center, std)
    # "uniform" means no informative prior is applied.
    return None
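
Every example on this page validates its string arguments with check_in, and later examples also use check_not_none. A minimal sketch of what such fail-fast helpers might look like (the project's own implementations may differ):

def check_in(name, value, options):
    # Hypothetical helper: raise immediately if value is not one of the allowed options.
    if value not in options:
        raise ValueError(
            "{} must be one of {}, got {!r}".format(name, tuple(options), value))


def check_not_none(name, value):
    # Hypothetical helper: raise immediately if a required argument is missing.
    if value is None:
        raise ValueError("{} must not be None".format(name))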
Example #2
        def decoder(state_sample, observation_dist="gaussian"):
            """Compute the data distribution of an observation from its state [1]."""
            check_in(
                "observation_dist",
                observation_dist,
                ("gaussian", "laplace", "bernoulli", "multinomial"),
            )

            timesteps = tf.shape(state_sample)[1]

            if self.pixel_observations:
                # original decoder from [1] for deepmind lab envs
                hidden = tf.layers.dense(state_sample, 1024, None)
                kwargs = dict(strides=2, activation=tf.nn.relu)
                hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1]])
                # 1 x 1
                hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs)
                # 5 x 5 x 128
                hidden = tf.layers.conv2d_transpose(hidden, 64, 5, **kwargs)
                # 13 x 13 x 64
                hidden = tf.layers.conv2d_transpose(hidden, 32, 6, **kwargs)
                # 30 x 30 x 32
                mean = 255 * tf.layers.conv2d_transpose(
                    hidden, 3, 6, strides=2, activation=tf.nn.sigmoid)
                # 64 x 64 x 3
                assert mean.shape[1:].as_list() == [64, 64, 3], mean.shape
            else:
                # decoder for gridworlds / structured observations
                hidden = state_sample
                d = self._hidden_layer_size
                for _ in range(4):
                    hidden = tf.layers.dense(hidden, d, tf.nn.relu)
                mean = tf.layers.dense(hidden, np.prod(self.data_shape), None)

            mean = tf.reshape(mean, [-1, timesteps] + list(self.data_shape))

            if observation_dist == "gaussian":
                dist = tfd.Normal(mean, self._obs_stddev)
            elif observation_dist == "laplace":
                dist = tfd.Laplace(mean, self._obs_stddev / np.sqrt(2))
            elif observation_dist == "bernoulli":
                dist = tfd.Bernoulli(probs=mean)
            else:
                mean = tf.reshape(mean, [-1, timesteps] +
                                  [np.prod(list(self.data_shape))])
                dist = tfd.Multinomial(total_count=1, probs=mean)
                reshape = tfp.bijectors.Reshape(
                    event_shape_out=list(self.data_shape))
                dist = reshape(dist)
                return dist

            dist = tfd.Independent(dist, len(self.data_shape))
            return dist
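
In the multinomial branch above, the flattened Multinomial is passed through a tfp.bijectors.Reshape bijector, which yields a TransformedDistribution whose event shape matches data_shape again. A small standalone illustration of that pattern, assuming TensorFlow Probability is installed (data_shape and the surrounding decoder are specific to the example's codebase):

import tensorflow_probability as tfp

tfd = tfp.distributions

# A one-hot categorical over 6 classes, reshaped to a 2 x 3 event shape.
flat = tfd.Multinomial(total_count=1, probs=[0.1, 0.1, 0.2, 0.2, 0.2, 0.2])
reshaped = tfp.bijectors.Reshape(event_shape_out=[2, 3])(flat)
print(reshaped.event_shape)  # (2, 3)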
Example #3
def check_parameters(args):
    """Check the parameters so that we fail fast."""
    inference_algorithm = args["inference_algorithm"]
    combination_algorithm = args["combination_algorithm"]
    measures = args["measures"]
    prior = args["prior"]
    inverse_dynamics_model_checkpoint = args[
        "inverse_dynamics_model_checkpoint"]

    check_in(
        "inference_algorithm",
        inference_algorithm,
        [
            "rlsp",
            "latent_rlsp",
            "latent_rlsp_ablation",
            "sampling",
            "deviation",
            "reachability",
            "spec",
        ],
    )
    check_in(
        "combination_algorithm",
        combination_algorithm,
        ("additive", "bayesian", "latent_vi", "latent_ppo"),
    )
    check_in("prior", prior, ["gaussian", "laplace", "uniform"])

    for i, measure in enumerate(measures):
        check_in(
            "measure {}".format(i),
            measure,
            [
                "inferred_reward", "true_reward", "final_reward",
                "model_training_error"
            ],
        )

    if combination_algorithm == "bayesian":
        check_in("inference_algorithm", inference_algorithm,
                 ["rlsp", "sampling"])

    if inference_algorithm == "latent_rlsp":
        check_not_none("inverse_dynamics_model_checkpoint",
                       inverse_dynamics_model_checkpoint)

    if (combination_algorithm.startswith("latent")
            and inference_algorithm != "latent_rlsp"):
        raise ValueError(
            "combination_algorithm 'latent' should only be used with 'latent_rlsp'"
        )
Example #4
def get_problem_parameters(env_name, problem_name):
    check_in("env_name", env_name, TOY_ENV_CLASSES)
    check_in("problem_name", problem_name, TOY_PROBLEMS[env_name])
    spec, cur_state, r_task, r_true = TOY_PROBLEMS[env_name][problem_name]
    env = TOY_ENV_CLASSES[env_name](spec)
    return env, env.get_num_from_state(cur_state), r_task, r_true
Example #5
def rlsp(
    _run,
    mdp,
    s_current,
    p_0,
    horizon,
    temp=1,
    epochs=1,
    learning_rate=0.2,
    r_prior=None,
    r_vec=None,
    threshold=1e-3,
    check_grad_flag=False,
    solver="value_iter",
    reset_solver=False,
    solver_iterations=1000,
):
    """The RLSP algorithm."""
    check_in("solver", solver, ("value_iter", "ppo"))

    def compute_grad(r_vec):
        # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s)
        if solver == "value_iter":
            policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp)
        elif solver == "ppo":
            policy = ppo.learn(r_vec,
                               reset_model=reset_solver,
                               total_timesteps=solver_iterations)

        _run.log_scalar(
            "policy_eval_r_vec",
            evaluate_policy(
                mdp,
                policy,
                mdp.get_num_from_state(mdp.init_state),
                1,
                mdp.f_matrix @ r_vec,
                horizon,
            ),
            i,
        )

        d_last_step, d_last_step_list = compute_d_last_step(mdp,
                                                            policy,
                                                            p_0,
                                                            horizon,
                                                            return_all=True)
        if d_last_step[s_current] == 0:
            print("Error in om_method: No feasible trajectories!")
            return r_vec

        expected_features, expected_features_list = compute_feature_expectations(
            mdp, policy, p_0, horizon)

        G = compute_g(mdp, policy, p_0, horizon, d_last_step_list,
                      expected_features_list)
        # Compute the gradient
        dL_dr_vec = G[s_current] / d_last_step[s_current]
        # Gradient of the prior
        if r_prior is not None:
            dL_dr_vec += r_prior.logdistr_grad(r_vec)
        return dL_dr_vec

    def compute_log_likelihood(r_vec):
        policy = value_iter(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp)
        d_last_step = compute_d_last_step(mdp, policy, p_0, horizon)
        log_likelihood = np.log(d_last_step[s_current])
        if r_prior is not None:
            log_likelihood += np.sum(r_prior.logpdf(r_vec))
        return log_likelihood

    def get_grad(_):
        """dummy function for use with check_grad()"""
        return dL_dr_vec

    if r_vec is None:
        r_vec = 0.01 * np.random.randn(mdp.f_matrix.shape[1])
    print("Initial reward vector: {}".format(r_vec))

    ppo = PPOSolver(mdp, temp)

    if check_grad_flag:
        grad_error_list = []

    for i in range(epochs):
        dL_dr_vec = compute_grad(r_vec)
        if check_grad_flag:
            grad_error_list.append(
                check_grad(compute_log_likelihood, get_grad, r_vec))

        # Gradient ascent
        r_vec = r_vec + learning_rate * dL_dr_vec

        grad_norm = np.linalg.norm(dL_dr_vec)

        with np.printoptions(precision=4, suppress=True):
            print("Epoch {}; Reward vector: {}; grad norm: {}".format(
                i, r_vec, grad_norm))
            if check_grad_flag:
                print("grad error: {}".format(grad_error_list[-1]))

        if grad_norm < threshold:
            if check_grad_flag:
                print()
                print("Max grad error: {}".format(
                    np.amax(np.asarray(grad_error_list))))
                print("Median grad error: {}".format(
                    np.median(np.asarray(grad_error_list))))
            break

    return r_vec
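
rlsp only interacts with the prior through logpdf and logdistr_grad (the gradient of the summed log density with respect to r_vec). A minimal Gaussian stand-in for that interface, assuming this is roughly what the NormalDistribution class from Example #1 exposes:

import numpy as np


class GaussianRewardPrior:
    """Hypothetical stand-in for the NormalDistribution prior used above."""

    def __init__(self, mean, std):
        self.mean = np.asarray(mean, dtype=float)
        self.std = float(std)

    def logpdf(self, r_vec):
        # Elementwise log density of an isotropic Gaussian.
        z = (r_vec - self.mean) / self.std
        return -0.5 * z ** 2 - np.log(self.std * np.sqrt(2 * np.pi))

    def logdistr_grad(self, r_vec):
        # Gradient of sum_i log N(r_i; mean_i, std) with respect to r_vec.
        return -(r_vec - self.mean) / self.std ** 2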
Example #6
def main(
    _run,
    env_id,
    prior,
    horizon,
    policy_horizon_factor,
    learning_rate,
    epochs,
    std,
    print_level,
    n_trajectories,
    solver_iterations,
    reset_solver,
    latent_model_checkpoint,
    inverse_dynamics_model_checkpoint,
    n_trajectories_forward_factor,
    trajectory_video_path,
    current_state_from,
    good_policy_path,
    current_state_file,
    continue_training_dynamics,
    continue_training_latent_space,
    debug_train_with_true_dynamics,
    debug_handcoded_features,
    vae_latent_space,
    identity_latent_space,
    clip_mujoco_obs,
    horizon_curriculum,
    n_eval_rollouts,
    inverse_model_parameters,
    latent_space_model_parameters,
    inverse_policy_parameters,
    experience_replay_size,
    n_sample_states,
    n_rollouts_init,
    add_policy_rollouts_to_replay,
    reweight_gradient,
    threshold,
    max_epochs_per_horizon,
    init_from_policy,
    reward_action_norm_factor,
    seed,
):
    print("--------")
    for key, val in locals().items():
        print(key, val)
    print("--------")

    # Check the parameters so that we fail fast
    check_in(
        "env_id",
        env_id,
        [
            "InvertedPendulum-v2",
            "HalfCheetah-v2",
            "HalfCheetah-FW-v2",
            "HalfCheetah-BW-v2",
            "Ant-v2",
            "Ant-FW-v2",
            "Hopper-v2",
            "Hopper-FW-v2",
            "FetchReachStack-v1",
        ],
    )
    if prior is not None:
        check_in("prior", prior, ["gaussian", "laplace", "uniform"])
    check_in(
        "current_state_from",
        current_state_from,
        ["optimal", "initial", "trajectory_file", "file"],
    )
    assert sum([debug_handcoded_features, vae_latent_space, identity_latent_space]) <= 1
    if current_state_from == "optimal":
        check_not_none("good_policy_path", good_policy_path)
    if not (debug_handcoded_features or vae_latent_space):
        check_not_none("latent_model_checkpoint", latent_model_checkpoint)

    np.random.seed(seed)
    tf.random.set_random_seed(seed)

    env = gym.make(env_id)

    if good_policy_path is None:
        true_reward_policy = None
    else:
        true_reward_policy = SAC.load(good_policy_path)

    random_policy = SAC(MlpPolicySac, env, verbose=0)

    # Sample input states
    if current_state_from == "file":
        with open(current_state_file, "rb") as f:
            current_states = list(pickle.load(f))
    elif current_state_from == "trajectory_file":
        rollout = load_data(current_state_file)["observations"][0]
        current_states = sample_obs_from_trajectory(rollout, n_sample_states)
    else:
        current_states = []
        for _ in range(n_sample_states):
            if current_state_from == "initial":
                state = env.reset()
            elif current_state_from == "optimal":
                observations, _, total_reward = get_trajectory(
                    env, true_reward_policy, True, False, True
                )
                print("Sample policy return:", total_reward)
                state = sample_obs_from_trajectory(observations, 1)[0]
            else:
                raise NotImplementedError()
            current_states.append(state)

    _run.info["current_states"] = current_states

    experience_replay = ExperienceReplay(experience_replay_size)
    if add_policy_rollouts_to_replay:
        experience_replay.add_random_rollouts(
            env, env.spec.max_episode_steps, int(0.25 * n_rollouts_init)
        )
        experience_replay.add_policy_rollouts(
            env,
            true_reward_policy,
            int(0.25 * n_rollouts_init),
            env.spec.max_episode_steps,
            eps_greedy=0,
        )
        experience_replay.add_policy_rollouts(
            env,
            true_reward_policy,
            int(0.25 * n_rollouts_init),
            env.spec.max_episode_steps,
            eps_greedy=0.12,
        )
        experience_replay.add_policy_rollouts(
            env,
            true_reward_policy,
            int(0.25 * n_rollouts_init),
            env.spec.max_episode_steps,
            eps_greedy=0.3,
        )
    else:
        experience_replay.add_random_rollouts(
            env, env.spec.max_episode_steps, n_rollouts_init
        )

    # Train / load transition models
    graph_latent, graph_bwd = tf.Graph(), tf.Graph()
    os.makedirs("tf_ckpt", exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    if debug_handcoded_features:
        latent_space = MujocoDebugFeatures(env)
    elif identity_latent_space:
        latent_space = IdentityFeatures(env)
    elif vae_latent_space:
        with graph_latent.as_default():
            if latent_model_checkpoint is not None:
                latent_space = StateVAE.restore(latent_model_checkpoint)
                latent_space.checkpoint_folder = None  # don't continue saving model
            else:
                label = "vae_{}_{}".format(env_id, timestamp)
                latent_model_checkpoint = "tf_ckpt/tf_ckpt_" + label

                latent_space = StateVAE(
                    env.observation_space.shape[0],
                    checkpoint_folder=latent_model_checkpoint,
                    **latent_space_model_parameters["model"],
                )
                initial_loss, final_loss = latent_space.learn(
                    experience_replay,
                    return_initial_loss=True,
                    verbose=True,
                    **latent_space_model_parameters["learn"],
                )
                print(
                    "Initialized latent space VAE: {} --> {}".format(
                        initial_loss, final_loss
                    )
                )
    else:
        if latent_model_checkpoint is not None:
            with graph_latent.as_default():
                latent_space = LatentSpaceModel.restore(env, latent_model_checkpoint)
            latent_space.checkpoint_folder = None  # don't continue saving model
        else:
            raise NotImplementedError()

    with graph_bwd.as_default():
        if debug_train_with_true_dynamics:
            inverse_transition_model = PendulumDynamics(latent_space, backward=True)
        elif inverse_dynamics_model_checkpoint is not None:
            inverse_transition_model = InverseDynamicsMLP.restore(
                env, experience_replay, inverse_dynamics_model_checkpoint
            )
            inverse_transition_model.checkpoint_folder = None
        else:
            label = "mlp_{}_{}".format(env_id, timestamp)
            inverse_dynamics_model_checkpoint = "tf_ckpt/tf_ckpt_" + label

            inverse_transition_model = InverseDynamicsMLP(
                env,
                experience_replay,
                checkpoint_folder=inverse_dynamics_model_checkpoint,
                **inverse_model_parameters["model"],
            )
            initial_loss, final_loss = inverse_transition_model.learn(
                return_initial_loss=True,
                verbose=True,
                **inverse_model_parameters["learn"],
            )
            print(
                "Initialized backward model: {} --> {}".format(initial_loss, final_loss)
            )

    _run.info["inverse_dynamics_model_checkpoint"] = inverse_dynamics_model_checkpoint
    _run.info["latent_model_checkpoint"] = latent_model_checkpoint

    tf_graphs = {"latent": graph_latent, "inverse": graph_bwd}

    reward_center = np.zeros(latent_space.state_size)
    r_prior = get_r_prior(prior, reward_center, std)

    last_n = 5
    feature_counts_forward_last_n = None
    feature_counts_backward_last_n = None
    inferred_reward_last_n = None

    _run.info["inferred_rewards"] = []

    def log_metrics(loc, glob):
        del glob
        # These accumulators are defined in main(), so they need nonlocal, not global.
        nonlocal feature_counts_forward_last_n
        nonlocal feature_counts_backward_last_n
        nonlocal inferred_reward_last_n
        step = loc["epoch"]
        _run.log_scalar("inverse_policy_error", loc["inverse_policy_final_loss"], step)
        _run.log_scalar(
            "inverse_policy_initial_loss", loc["inverse_policy_initial_loss"], step
        )
        _run.log_scalar("grad_norm", loc["grad_norm"], step)
        _run.log_scalar("last_n_grad_norm", loc["last_n_grad_norm"], step)

        # feature counts
        feature_counts_forward = loc["feature_counts_forward"]
        feature_counts_backward = loc["feature_counts_backward"]

        r_inferred = loc["r_vec"]
        _run.info["inferred_rewards"].append(r_inferred)

        if step == 1:
            feature_counts_forward_last_n = [feature_counts_forward] * last_n
            feature_counts_backward_last_n = [feature_counts_backward] * last_n
            inferred_reward_last_n = [r_inferred] * last_n

        # magnitude
        fw_mag = np.linalg.norm(feature_counts_forward)
        bw_mag = np.linalg.norm(feature_counts_backward)
        rew_mag = np.linalg.norm(r_inferred)
        _run.log_scalar("feature_counts_forward_magnitude", fw_mag, step)
        _run.log_scalar("feature_counts_backward_magnitude", bw_mag, step)
        _run.log_scalar("inferred_reward_magnitude", rew_mag, step)
        # direction
        fw_cos_last = get_cosine_similarity(
            feature_counts_forward, feature_counts_forward_last_n[-1]
        )
        bw_cos_last = get_cosine_similarity(
            feature_counts_backward, feature_counts_backward_last_n[-1]
        )
        rew_cos_last = get_cosine_similarity(r_inferred, inferred_reward_last_n[-1])
        _run.log_scalar("feature_counts_forward_cos_last", fw_cos_last, step)
        _run.log_scalar("feature_counts_backward_cos_last", bw_cos_last, step)
        _run.log_scalar("inferred_reward_cos_last", rew_cos_last, step)

        fw_cos_last_n = get_cosine_similarity(
            feature_counts_forward, feature_counts_forward_last_n[0]
        )
        bw_cos_last_n = get_cosine_similarity(
            feature_counts_backward, feature_counts_backward_last_n[0]
        )
        rew_cos_last_n = get_cosine_similarity(r_inferred, inferred_reward_last_n[0])
        _run.log_scalar(
            "feature_counts_forward_cos_last_{}".format(last_n), fw_cos_last_n, step
        )
        _run.log_scalar(
            "feature_counts_backward_cos_last_{}".format(last_n), bw_cos_last_n, step
        )
        _run.log_scalar(
            "inferred_reward_cos_last_{}".format(last_n), rew_cos_last_n, step
        )

        _run.log_scalar("horizon", loc["horizon"], step)
        _run.log_scalar("threshold", loc["threshold"], step)

        feature_counts_forward_last_n.append(feature_counts_forward)
        feature_counts_backward_last_n.append(feature_counts_backward)
        inferred_reward_last_n.append(r_inferred)
        feature_counts_forward_last_n.pop(0)
        feature_counts_backward_last_n.pop(0)
        inferred_reward_last_n.pop(0)

        r_inferred_normalized = r_inferred / np.linalg.norm(r_inferred)

        env_inferred = LatentSpaceRewardWrapper(
            env, latent_space, r_inferred_normalized
        )
        if true_reward_policy is not None:
            good_policy_true_reward_obtained = evaluate_policy(
                env, true_reward_policy, n_eval_rollouts
            )
            good_policy_inferred_reward_obtained = evaluate_policy(
                env_inferred, true_reward_policy, n_eval_rollouts
            )

            _run.log_scalar(
                "good_policy_true_reward_obtained",
                good_policy_true_reward_obtained,
                step,
            )
            _run.log_scalar(
                "good_policy_inferred_reward_obtained",
                good_policy_inferred_reward_obtained,
                step,
            )

        random_policy_true_reward_obtained = evaluate_policy(
            env, random_policy, n_eval_rollouts
        )
        random_policy_inferred_reward_obtained = evaluate_policy(
            env_inferred, random_policy, n_eval_rollouts
        )

        _run.log_scalar(
            "random_policy_true_reward_obtained",
            random_policy_true_reward_obtained,
            step,
        )
        _run.log_scalar(
            "random_policy_inferred_reward_obtained",
            random_policy_inferred_reward_obtained,
            step,
        )

        rlsp_policy = loc["solver"]
        with Artifact(f"rlsp_policy_{step}.zip", None, _run) as f:
            rlsp_policy.save(f)

        rlsp_policy_true_reward_obtained = evaluate_policy(
            env, rlsp_policy, n_eval_rollouts
        )
        rlsp_policy_inferred_reward_obtained = evaluate_policy(
            env_inferred, rlsp_policy, n_eval_rollouts
        )

        _run.log_scalar(
            "rlsp_policy_true_reward_obtained", rlsp_policy_true_reward_obtained, step,
        )
        _run.log_scalar(
            "rlsp_policy_inferred_reward_obtained",
            rlsp_policy_inferred_reward_obtained,
            step,
        )

        if true_reward_policy is not None:
            print("True reward policy: true return", good_policy_true_reward_obtained)
            print(
                "True reward policy: inferred return",
                good_policy_inferred_reward_obtained,
            )
        print("RLSP policy: true return", rlsp_policy_true_reward_obtained)
        print("RLSP policy: inferred return", rlsp_policy_inferred_reward_obtained)
        print("Random policy: true return", random_policy_true_reward_obtained)
        print("Random policy: inferred return", random_policy_inferred_reward_obtained)

        _run.log_scalar("latent_initial_loss", loc["latent_initial_loss"], step)
        _run.log_scalar("latent_final_loss", loc["latent_final_loss"], step)
        _run.log_scalar("backward_initial_loss", loc["backward_initial_loss"], step)
        _run.log_scalar("backward_final_loss", loc["backward_final_loss"], step)

    r_inferred = latent_rlsp(
        _run,
        env,
        current_states,
        horizon,
        experience_replay,
        latent_space,
        inverse_transition_model,
        policy_horizon_factor=policy_horizon_factor,
        epochs=epochs,
        learning_rate=learning_rate,
        r_prior=r_prior,
        threshold=threshold,
        n_trajectories=n_trajectories,
        reset_solver=reset_solver,
        solver_iterations=solver_iterations,
        print_level=print_level,
        n_trajectories_forward_factor=n_trajectories_forward_factor,
        callback=log_metrics,
        trajectory_video_path=trajectory_video_path,
        continue_training_dynamics=continue_training_dynamics,
        continue_training_latent_space=continue_training_latent_space,
        tf_graphs=tf_graphs,
        clip_mujoco_obs=clip_mujoco_obs,
        horizon_curriculum=horizon_curriculum,
        inverse_model_parameters=inverse_model_parameters,
        latent_space_model_parameters=latent_space_model_parameters,
        inverse_policy_parameters=inverse_policy_parameters,
        reweight_gradient=reweight_gradient,
        max_epochs_per_horizon=max_epochs_per_horizon,
        init_from_policy=init_from_policy,
        reward_action_norm_factor=reward_action_norm_factor,
    )