def legacy_make_test_rewards(
    n_questions: int,
    n_rewards: int,
    true_reward: np.ndarray,
    epsilons: List[float],
    use_equiv: bool,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """ Generates n_rewards reward vectors and determines which are aligned. """
    assert n_rewards > 0
    assert_reward(true_reward, use_equiv)

    sim = Driver()
    trajs = make_random_questions(n_questions, sim)
    _, normals = make_normals(trajs, sim, use_equiv)
    gt_pref = true_reward @ normals.T > 0
    normals = orient_normals(normals, gt_pref, use_equiv)
    assert_normals(normals, use_equiv)

    n_reward_features = normals.shape[1]

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = {}

    for epsilon in epsilons:
        assert epsilon >= 0.0

        cov = 1.0

        # Keep only the questions whose value difference under the true reward
        # exceeds epsilon. Filter into a separate variable so the full set of
        # normals is preserved for the next epsilon.
        epsilon_normals = normals[true_reward @ normals.T > epsilon]

        rewards = make_gaussian_rewards(n_rewards,
                                        use_equiv,
                                        mean=true_reward,
                                        cov=cov)
        ground_truth_alignment = cast(
            np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1))
        mean_agree = np.mean(ground_truth_alignment)

        # Tune the sampling covariance until roughly half of the sampled rewards
        # are aligned with the true reward on the filtered questions.
        while mean_agree > 0.55 or mean_agree < 0.45:
            if mean_agree > 0.55:
                cov *= 1.1
            else:
                cov /= 1.1
            if not np.isfinite(cov) or cov <= 0.0 or cov >= 100.0:
                # TODO(joschnei): Break is a code smell
                logging.warning(
                    f"cov={cov}, using last good batch of rewards.")
                break
            rewards = make_gaussian_rewards(n_rewards,
                                            use_equiv,
                                            mean=true_reward,
                                            cov=cov)
            ground_truth_alignment = cast(
                np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1))
            mean_agree = np.mean(ground_truth_alignment)

        assert ground_truth_alignment.shape == (n_rewards, )
        assert rewards.shape == (n_rewards, n_reward_features)

        test_rewards[epsilon] = (rewards, ground_truth_alignment)

    return test_rewards
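# Illustrative usage sketch (an assumption, not part of the original code): shows the
# expected call shape and return structure of legacy_make_test_rewards. The reward
# vector and epsilon values below are hypothetical.
def _example_legacy_make_test_rewards() -> None:
    true_reward = np.array([0.5, -0.5, 0.5, -0.5])  # hypothetical unit-norm reward
    test_rewards = legacy_make_test_rewards(
        n_questions=100,
        n_rewards=1000,
        true_reward=true_reward,
        epsilons=[0.0, 0.1],
        use_equiv=False,
    )
    # Each epsilon maps to (rewards, alignment), where rewards has shape
    # (n_rewards, n_features) and alignment is a boolean vector of length n_rewards.
    rewards, aligned = test_rewards[0.1]
    assert rewards.shape[0] == aligned.shape[0] == 1000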
def main() -> None:
    """ Benchmarks average rollout time of the Driver simulation against LegacyEnv. """
    reward_weights = np.ones(4)
    sim = Driver()
    env = LegacyEnv(reward_weights)

    plans = make_actions(1000)

    returns = []
    start = perf_counter()
    for plan in plans:
        sim.feed(plan)
        features = sim.get_features()
        returns.append(reward_weights @ features)
    stop = perf_counter()
    print(f"Legacy env took {(stop - start) / len(plans)} seconds on average")
    # Driver env is a lot faster for rollouts

    returns = []
    start = perf_counter()
    for plan in plans:
        env.reset()
        plan_return = 0.0
        for action in plan:
            _, reward, _, _ = env.step(action)
            plan_return += reward
        returns.append(plan_return)
    stop = perf_counter()
    print(f"tf env took {(stop - start) / len(plans)} seconds on average")
def play(sim: Driver, optimal_ctrl):
    """ Renders trajectory for user. """
    sim.set_ctrl(optimal_ctrl)
    keep_playing = "y"
    while keep_playing == "y":
        keep_playing = "u"
        sim.watch(1)
        while keep_playing != "n" and keep_playing != "y":
            keep_playing = input("Again? [y/n]: ").lower()
    return optimal_ctrl
def test_get_simulated_feedback(actions: np.ndarray, reward: np.ndarray):
    feature_1, feature_2, pref = get_simulated_feedback(
        Driver(), actions[0], actions[1], "strict", reward
    )

    expected_pref = (reward @ (feature_1 - feature_2) > 0) * 2 - 1

    assert expected_pref == pref
def test_orient_normals(actions: np.ndarray, reward: np.ndarray):
    reward = safe_normalize(reward)

    _, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    value_diffs = reward @ normals.T
    prefs = value_diffs > 0

    oriented_normals = orient_normals(normals, preferences=prefs)
    assert_normals(oriented_normals)
    assert np.all(reward @ oriented_normals.T == np.abs(value_diffs))
def make_normals(inputs: np.ndarray, sim: Driver,
                 use_equiv: bool) -> Tuple[np.ndarray, np.ndarray]:
    """Converts pairs of car inputs to trajectory preference normal vectors.

    Args:
        inputs (np.ndarray): (n, 2, T, 2) array of pairs of 2-dimensional actions for T timesteps
        sim (Driver): Driving simulation to get features from
        use_equiv (bool): Allow equivalent preferences?

    Returns:
        Tuple[np.ndarray, np.ndarray]: input features and normal vectors
    """
    if len(inputs.shape) == 3:
        shape_compat(inputs, (-1, 2, -1))
    elif len(inputs.shape) == 4:
        shape_compat(inputs, (-1, 2, -1, 2))

    normals = np.empty(shape=(inputs.shape[0], sim.num_of_features))
    input_features = np.empty(shape=(inputs.shape[0], 2, sim.num_of_features))
    for i, (input_a, input_b) in enumerate(inputs):
        sim.feed(input_a)
        phi_a = np.array(sim.get_features())

        sim.feed(input_b)
        phi_b = np.array(sim.get_features())

        input_features[i] = np.stack((phi_a, phi_b))

        normals[i] = phi_a - phi_b
    assert_normals(normals, use_equiv)
    return input_features, normals
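# Illustrative usage sketch (an assumption, not part of the original code): mirrors how
# legacy_make_test_rewards pairs make_random_questions with make_normals. The question
# count is hypothetical.
def _example_make_normals() -> None:
    sim = Driver()
    questions = make_random_questions(n_questions=3, env=sim)  # 3 pairs of action sequences
    input_features, normals = make_normals(questions, sim, use_equiv=False)
    # One feature vector per trajectory in each pair, one normal vector per pair.
    assert input_features.shape == (3, 2, sim.num_of_features)
    assert normals.shape == (3, sim.num_of_features)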
def get_simulated_feedback(
    simulation: Driver,
    input_A: np.ndarray,
    input_B: np.ndarray,
    query_type: str,
    true_reward: np.ndarray,
    delta: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """ Gets preference between trajectories from an agent simulated by true_reward """
    simulation.feed(input_A)
    phi_A = np.array(simulation.get_features())
    simulation.feed(input_B)
    phi_B = np.array(simulation.get_features())
    if query_type == "weak":
        # TODO(joschnei): Implement weak errors using delta. I think there's a model for this but I can't remember off hand.
        raise NotImplementedError(
            "Simulated weak preferences not implemented.")
        if delta is None:
            raise ValueError("Must provide delta when using weak queries.")
    elif query_type == "strict":
        s = 1 if true_reward @ (phi_A - phi_B) > 0 else -1
    else:
        raise ValueError(
            f'query type {query_type} must be either "strict" or "weak"')
    return phi_A, phi_B, s
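# Illustrative usage sketch (an assumption, not part of the original code): mirrors how the
# elicitation loop queries a simulated human on a single random question. The reward
# vector is hypothetical.
def _example_get_simulated_feedback() -> None:
    sim = Driver()
    ((input_a, input_b),) = make_random_questions(n_questions=1, env=sim)
    true_reward = np.array([0.5, -0.5, 0.5, -0.5])  # hypothetical unit-norm reward
    phi_a, phi_b, pref = get_simulated_feedback(sim, input_a, input_b, "strict", true_reward)
    # Strict queries return +1 if the first trajectory is preferred, -1 otherwise.
    assert pref in (-1, 1)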
def main(datadir: Path) -> None:
    logging.basicConfig(level="INFO")

    datadir = Path(datadir)

    flags = pickle.load(open(datadir / "flags.pkl", "rb"))
    use_equiv = False
    sim = Driver()
    n_reward_features = sim.num_of_features

    inputs = np.load(datadir / "inputs.npy")
    n_questions = inputs.shape[0]
    assert inputs.shape[1] == 2

    input_features = np.load(datadir / "input_features.npy")
    assert input_features.shape == (n_questions, 2, n_reward_features), input_features.shape

    assert_input_feature_consistency(inputs, input_features, sim)

    normals = np.load(datadir / "normals.npy")
    logging.info(f"There are {normals.shape[0]} questions")
    assert_normals(normals, use_equiv, n_reward_features)

    assert_normal_consistency(input_features, normals)

    preferences = np.load(datadir / "preferences.npy")
    assert preferences.shape == (n_questions,)
    assert np.all((preferences == 1) | (preferences == -1))

    oriented_normals = orient_normals(normals, preferences)

    if (datadir / "true_reward.npy").exists():
        true_reward = np.load(datadir / "true_reward.npy")
        assert_reward(true_reward, use_equiv, n_reward_features)
        logging.info(f"true_reward={true_reward}")
        assert_true_reward_consistency(oriented_normals, true_reward)

    if (datadir / "mean_reward.npy").exists():
        mean_reward = np.load(datadir / "mean_reward.npy")
        logging.info(f"mean_reward={mean_reward}")
        assert_reward(mean_reward, use_equiv, n_reward_features)

        mean_accuracy = np.mean(oriented_normals @ mean_reward > 0)
        logging.info(f"Accuracy of mean reward function is {mean_accuracy}")
def make_gt_test_align(
    test_rewards: np.ndarray,
    n_questions: int,
    true_reward: np.ndarray,
    epsilon: float,
    use_equiv: bool = False,
) -> np.ndarray:
    """ Labels each test reward as aligned if it agrees with the true reward on all epsilon-filtered random questions. """
    env = Driver()
    trajs = make_random_questions(n_questions, env)
    _, normals = make_normals(trajs, env, use_equiv)

    value_diff = true_reward @ normals.T
    eps_questions = np.abs(value_diff) > epsilon
    normals = normals[eps_questions]

    gt_pref = value_diff[eps_questions] > 0
    normals = orient_normals(normals, gt_pref, use_equiv)

    alignment = cast(np.ndarray, np.all(test_rewards @ normals.T > 0, axis=1))
    assert alignment.shape == (
        test_rewards.shape[0],
    ), f"alignment shape={alignment.shape} is not expected {test_rewards.shape[0]}"
    return alignment
def human(
    epsilons: List[float] = [0.0],
    deltas: List[float] = [0.05],
    n_rewards: int = 10000,
    human_samples: List[int] = [1],
    n_model_samples: int = 1000,
    input_features_name: Path = Path("input_features.npy"),
    normals_name: Path = Path("normals.npy"),
    preferences_name: Path = Path("preferences.npy"),
    flags_name: Path = Path("flags.pkl"),
    datadir: Path = Path("questions"),
    outdir: Path = Path("questions"),
    rewards_path: Optional[Path] = None,
    use_mean_reward: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    n_cpus: int = 1,
    overwrite: bool = False,
):
    """ Evaluates alignment test elicited from a human. """
    outdir.mkdir(parents=True, exist_ok=True)

    parallel = Parallel(n_jobs=n_cpus)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    sim = Driver()
    n_reward_features = sim.num_of_features

    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    assert elicited_preferences.shape[0] > 0

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_model_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
    )

    test_path = outdir / make_outname(
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
        base="indices",
    )
    test_results_path = outdir / make_outname(
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
        base="test_results",
    )

    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite, default={})
    results: Dict[Experiment, np.ndarray] = load(test_results_path, overwrite, default={})

    test_rewards = (np.load(open(rewards_path, "rb"))
                    if rewards_path is not None else make_gaussian_rewards(
                        n_rewards, use_equiv))
    np.save(outdir / "test_rewards.npy", test_rewards)

    experiments = make_experiments(epsilons,
                                   deltas,
                                   human_samples,
                                   overwrite,
                                   experiments=set(minimal_tests.keys()))

    for indices, result, experiment in parallel(
            delayed(run_human_experiment)(
                test_rewards,
                elicited_normals,
                elicited_input_features,
                elicited_preferences,
                epsilon,
                delta,
                n,
                factory,
                use_equiv,
            ) for epsilon, delta, n in experiments):
        minimal_tests[experiment] = indices
        results[experiment] = result

    pkl.dump(minimal_tests, open(test_path, "wb"))
    pkl.dump(results, open(test_results_path, "wb"))
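# Illustrative invocation sketch (an assumption, not part of the original code): the data
# directory is hypothetical and must already contain the elicited normals, preferences,
# input features, and flags produced by the elicitation script.
def _example_human_evaluation() -> None:
    human(
        epsilons=[0.0, 0.1],
        deltas=[0.05],
        n_rewards=10000,
        human_samples=[1, 5, 10],
        datadir=Path("questions"),  # hypothetical elicitation output directory
        outdir=Path("questions"),
        n_cpus=4,
    )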
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """ Evaluates alignment test generated by ground-truth rewards. """
    logging.basicConfig(level=verbosity,
                        format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(
                    f"Replication {replication} does not exist, skipping")
                continue

            logging.info(f"Starting replication {replication}")

            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        exit()

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # argh parses optional arguments as strings, so coerce back to int here.
        n_random_test_questions = int(n_random_test_questions)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")
    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(f"""Filtering settings:
    # reward samples={n_reward_samples},
    use mean reward={use_mean_reward},
    skip duplicates={skip_remove_duplicates}
    skip noise={True}
    skip epsilon={skip_epsilon_filtering}
    skip redundancy={skip_redundancy_filtering}
    use true epsilon={use_true_epsilon}
    """)

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path,
                                                    overwrite_results,
                                                    default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path,
                                                       overwrite_results,
                                                       default={})

    experiments = make_experiments(epsilons,
                                   deltas,
                                   human_samples,
                                   overwrite_results,
                                   experiments=set(minimal_tests.keys()))

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=flags["equiv_size"],
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0

        logging.info(
            f"{np.mean(good_indices) * 100:.2f}% of new test questions agree with gt reward."
        )

        if use_cheating_questions:
            logging.info(f"Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(elicited_normals[:max_n], preferences,
                                 use_equiv, n_reward_features)
        logging.debug(f"normals={normals[:10]}")

        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward,
                                                epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
            delayed(run_gt_experiment)(
                normals=normals,
                test_rewards=test_rewards[epsilon][0],
                test_reward_alignment=test_rewards[epsilon][1],
                epsilon=epsilon,
                delta=delta,
                use_equiv=use_equiv,
                n_human_samples=n,
                factory=factory,
                input_features=input_features,
                preferences=preferences,
                outdir=outdir,
                verbosity=verbosity,
            ) for epsilon, delta, n in experiments):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    pkl.dump(confusions, open(confusion_path, "wb"))
    pkl.dump(minimal_tests, open(test_path, "wb"))
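# Illustrative invocation sketch (an assumption, not part of the original code): directory
# layout and hyperparameters are hypothetical; datadir must contain the elicitation outputs
# and the ground-truth reward.
def _example_simulated_evaluation() -> None:
    simulated(
        epsilons=[0.0, 0.1],
        n_rewards=100,
        human_samples=[1, 5, 10],
        n_reward_samples=1000,
        datadir=Path("data/simulated/elicitation"),  # hypothetical data directory
        outdir=Path("data/simulated/evaluation"),
        n_cpus=4,
    )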
def simulated(
    outdir: Path,
    criterion: Literal["information", "volume", "random"],
    termination_threshold: float,
    n_reward_samples: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: Optional[float] = None,
    true_reward_path: Optional[Path] = None,
    continuous: bool = False,
    overwrite: bool = False,
    replications: Optional[str] = None,
):
    """ Generates a test by eliciting from a human simulated by a ground truth reward. """
    if replications is not None:
        replication_indices = parse_replications(replications)
        if true_reward_path is not None:
            reward_dir, reward_name = make_reward_path(true_reward_path)
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    true_reward_path=reward_dir / str(i) / reward_name,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        else:
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        exit()

    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    env = Driver()
    d = env.num_of_features

    if true_reward_path is not None:
        logging.info(f"Loading true reward from {true_reward_path}")
        true_reward = np.load(true_reward_path)
    else:
        logging.info("Randomly generating true reward")
        true_reward = np.random.normal(size=(d,))
        true_reward = true_reward / np.linalg.norm(true_reward)
        np.save(outdir / "true_reward.npy", true_reward)

    pickle.dump(
        {
            "criterion": criterion,
            "reward_iterations": n_reward_samples,
            "stop_thresh": termination_threshold,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    # If there is already data, feed it to the w_sampler to get the right posterior.
    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
    try:
        while score >= termination_threshold:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                sample_count=n_reward_samples, query_type=query_type, delta=equiv_size
            )

            input_A, input_B, score = run_algo(criterion, env, w_samples, delta_samples, continuous)
            logging.info(f"Score={score}")

            if score > termination_threshold:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_simulated_feedback(
                    simulation=env,
                    input_A=input_A,
                    input_B=input_B,
                    query_type=query_type,
                    true_reward=true_reward,
                    delta=equiv_size,
                )
                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
                preferences = append(preferences, preference)
                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not exit again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
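# Illustrative invocation sketch (an assumption, not part of the original code): paths and
# hyperparameters are hypothetical; omitting true_reward_path makes the function sample
# and save a random true reward.
def _example_simulated_elicitation() -> None:
    simulated(
        outdir=Path("data/simulated/elicitation"),  # hypothetical output directory
        criterion="information",
        termination_threshold=0.1,
        n_reward_samples=100,
        query_type="strict",
        equiv_size=1.1,
    )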
def human(
    criterion: str,
    query_type: str,
    epsilon: float,
    n_reward_samples: int,
    equiv_size: float,
    outdir: Path = Path("questions"),
    continuous: bool = False,
    overwrite: bool = False,
):
    """ Generates a test by eliciting preferences from a human. """
    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    simulation_object = Driver()
    d = simulation_object.num_of_features

    pickle.dump(
        {
            "criterion": criterion,
            "query_type": query_type,
            "epsilon": epsilon,
            "reward_iterations": n_reward_samples,
            "delta": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
    try:
        while score >= epsilon:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                n_reward_samples, query_type, equiv_size
            )

            input_A, input_B, score = run_algo(
                criterion, simulation_object, w_samples, delta_samples, continuous
            )

            if score > epsilon:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_feedback(
                    simulation_object, input_A, input_B, query_type
                )
                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
                preferences = append(preferences, preference)
                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not exit again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
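# Illustrative invocation sketch (an assumption, not part of the original code): the
# hyperparameters are hypothetical; the function prompts a real human for preferences
# through the Driver visualization.
def _example_human_elicitation() -> None:
    human(
        criterion="information",
        query_type="strict",
        epsilon=0.1,
        n_reward_samples=100,
        equiv_size=1.1,
        outdir=Path("questions"),  # hypothetical output directory
    )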
def test_make_normals(actions: np.ndarray):
    features, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    assert np.all((features[0][0] - features[0][1]) == normals)
    assert_normals(normals)
def main(
    n_questions: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: float = 1.1,
    reward_iterations: int = 100,
    outdir: Path = Path("data/simulated/random/elicitation"),
    human: bool = False,
    reward_path: Optional[Path] = None,
    replications: Optional[str] = None,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """ Elicits preferences for randomly generated questions from a human or a simulated human given by a ground-truth reward. """
    outpath = Path(outdir)
    outpath.mkdir(parents=True, exist_ok=True)
    setup_logging(verbosity=verbosity, log_path=outpath / "log.txt")

    if not human:
        assert reward_path is not None
        reward_dir, reward_name = make_reward_path(reward_path)
        reward_path = reward_dir / reward_name

    if replications is not None:
        replication_indices = parse_replications(replications)
        n_cpus = max(1, min(multiprocessing.cpu_count() - 4, len(replication_indices)))
        Parallel(n_jobs=n_cpus)(
            delayed(main)(
                n_questions=n_questions,
                query_type=query_type,
                equiv_size=equiv_size,
                reward_iterations=reward_iterations,
                outdir=outpath / str(i),
                human=human,
                reward_path=reward_dir / str(i) / reward_name if not human else None,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            for i in replication_indices
        )
        exit()

    if not human:
        assert reward_path is not None
        if not reward_path.exists():
            logging.warning("Reward path given does not exist, generating random reward.")
            true_reward = np.random.default_rng().normal(loc=0, scale=1, size=(4,))
            true_reward = safe_normalize(true_reward)
            np.save(reward_path, true_reward)
        else:
            true_reward = np.load(reward_path)

    pickle.dump(
        {
            "n_questions": n_questions,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "reward_iterations": reward_iterations,
            "human": human,
        },
        open(outpath / "flags.pkl", "wb"),
    )

    normals = load(outpath / "normals.npy", overwrite=overwrite)
    preferences = load(outpath / "preferences.npy", overwrite=overwrite)
    # TODO(joschnei): Make class for inputs, dimensions are too difficult to reason about
    # (N, 2, 100)
    inputs = load(outpath / "inputs.npy", overwrite=overwrite)
    input_features = load(outpath / "input_features.npy", overwrite=overwrite)

    env = Driver()

    if (
        inputs is not None
        and input_features is not None
        and inputs.shape[0] > input_features.shape[0]
    ):
        logging.info("Catching up to previously generated trajectories.")
        input_A, input_B = inputs[-1]

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )

        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    # Questions and inputs are duplicated, but this keeps everything consistent for the hot-load case
    new_questions = n_questions - inputs.shape[0] if inputs is not None else n_questions
    questions = make_random_questions(n_questions=new_questions, env=env)
    logging.debug(f"questions={questions[:10]}")

    if inputs is not None:
        assert input_features is not None
        assert normals is not None
        assert preferences is not None
        assert inputs.shape[0] == input_features.shape[0]
        assert inputs.shape[0] == normals.shape[0]
        assert inputs.shape[0] == preferences.shape[0]

    for input_A, input_B in questions:
        inputs = update_inputs(input_A, input_B, inputs, outpath)

        if inputs.shape[0] % 10 == 0:
            logging.info(f"{inputs.shape[0]} of {n_questions}")

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )

        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    save_reward(
        query_type=query_type,
        true_delta=equiv_size,
        w_sampler=Sampler(env.num_of_features),
        n_reward_samples=reward_iterations,
        outdir=outpath,
    )
def collect(
    outdir: Path,
    n_rewards: int,
    test_reward_path: Optional[Path] = None,
    std: Optional[float] = None,
    mean_reward_path: Optional[Path] = None,
    normals_paths: Optional[List[Path]] = None,
    preferences_paths: Optional[List[Path]] = None,
    use_random: bool = False,
    use_plausible: bool = False,
    skip_human: bool = False,
    overwrite: bool = False,
) -> None:
    """Collects ground truth labels for the optimal trajectories of some reward functions.

    Args:
        outdir (Path): Directory to write output to
        n_rewards (int): Number of rewards to generate or process
        test_reward_path (Optional[Path], optional): Path to numpy array of reward weights to test. Defaults to None.
        std (Optional[float], optional): Standard deviation of the normal distribution to draw test reward weights from. Defaults to None.
        mean_reward_path (Optional[Path], optional): Path to numpy array specifying mean reward weights to sample around. Defaults to None.
        overwrite (bool, optional): Overwrite output? Defaults to False.

    Raises:
        ValueError: Raised if neither test_reward_path nor both std and mean_reward_path are specified. The test rewards need to come from somewhere.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    out_rewards = load(outdir, "test_rewards.npy", overwrite=overwrite)
    new_rewards_index = out_rewards.shape[0] if out_rewards is not None else 0
    num_new_rewards = n_rewards - new_rewards_index

    env = Driver()

    if num_new_rewards > 0:
        if test_reward_path is not None:
            rewards = np.load(test_reward_path)[new_rewards_index:n_rewards]
        elif mean_reward_path is not None and std is not None:
            mean_reward = np.load(mean_reward_path)
            rewards = default_rng().normal(loc=mean_reward,
                                           scale=std,
                                           size=(num_new_rewards,
                                                 *mean_reward.shape))
        elif normals_paths is not None and preferences_paths is not None and std is not None:
            # NOTE(joschnei): This turned out not to work, because the random baseline is poisoning the well
            normals = None
            for normals_path, preferences_path in zip(normals_paths,
                                                      preferences_paths):
                single_normals = np.load(normals_path)
                single_preferences = np.load(preferences_path)
                single_normals = (single_normals.T * single_preferences).T
                normals = append(normals, single_normals, flat=True)
            # TODO(joschnei): These can all be loaded in from flags.pkl, but I'm too lazy for that.
            mean_reward = make_mode_reward(
                query_type="strict",
                true_delta=1.1,
                w_sampler=Sampler(env.num_of_features),
                n_reward_samples=100,
            )
            assert np.all(np.isfinite(mean_reward))
            rewards = default_rng().normal(loc=mean_reward,
                                           scale=std,
                                           size=(num_new_rewards,
                                                 *mean_reward.shape))
            assert np.all(np.isfinite(rewards))
        elif use_random:
            rewards = default_rng().normal(loc=0,
                                           scale=1,
                                           size=(num_new_rewards,
                                                 env.num_of_features))
            # Normalize each reward vector to unit length.
            rewards = rewards / np.linalg.norm(rewards, axis=1, keepdims=True)
        elif use_plausible:
            # Generate random rewards with plausible weights, i.e. ones with the right sign.
            rewards = default_rng().normal(loc=0,
                                           scale=1,
                                           size=(num_new_rewards,
                                                 env.num_of_features))
            # Normalize each reward vector to unit length.
            rewards = rewards / np.linalg.norm(rewards, axis=1, keepdims=True)

            # See models.py for reward feature details.
            rewards[:, 0] = np.abs(rewards[:, 0])
            rewards[:, 1] = -np.abs(rewards[:, 1])
            rewards[:, 2] = np.abs(rewards[:, 2])
            rewards[:, 3] = -np.abs(rewards[:, 3])
        else:
            raise ValueError(
                "You must either supply a path to the test rewards, or a mean reward and "
                "std from which to sample the test rewards.")
        out_rewards = append(out_rewards, rewards, flat=True)
    else:
        assert out_rewards is not None

    assert np.all(np.isfinite(out_rewards))
    np.save(open(outdir / "test_rewards.npy", "wb"), out_rewards)

    paths = load(outdir, "optimal_paths.npy", overwrite=overwrite)
    new_paths_index = paths.shape[0] if paths is not None else 0
    num_new_paths = n_rewards - new_paths_index

    if num_new_paths > 0:
        new_paths = np.array(
            Parallel(n_jobs=-2)(delayed(make_opt_traj)(reward)
                                for reward in out_rewards[new_paths_index:]))
        paths = append(paths, new_paths, flat=True)
    else:
        assert paths is not None
    np.save(open(outdir / "optimal_paths.npy", "wb"), np.array(paths))

    gt_alignment = load(outdir, "alignment.npy", overwrite=overwrite)
    new_gt_index = gt_alignment.size if gt_alignment is not None else 0

    if skip_human:
        exit()

    for path in paths[new_gt_index:]:
        env.set_ctrl(path)
        env.watch(1)

        alignment = input("Aligned (y/n):").lower()
        while alignment not in ["y", "n"]:
            alignment = input("Aligned (y/n):").lower()
        gt_alignment = append(gt_alignment, alignment == "y")

    np.save(open(outdir / "alignment.npy", "wb"), gt_alignment)
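# Illustrative invocation sketch (an assumption, not part of the original code): samples
# test rewards around a hypothetical mean reward file, rolls out optimal trajectories, and
# prompts for ground-truth alignment labels.
def _example_collect() -> None:
    collect(
        outdir=Path("data/collect"),                    # hypothetical output directory
        n_rewards=10,
        mean_reward_path=Path("data/mean_reward.npy"),  # hypothetical mean reward weights
        std=0.1,
    )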