def make_opt_trajs(
    traj_opt: TrajOptimizer,
    rewards: np.ndarray,
    starts: np.ndarray,
    log_time: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
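    """Runs the trajectory optimizer for each (reward, start state) pair.

    Returns the optimized trajectories as an array of shape (n, 50, 2) together
    with the final planner loss for each trajectory. If log_time is set, logs
    the mean planning time per trajectory.
    """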
    trajs = []
    losses = []
    times = []
    for reward, start_state in zip(rewards, starts):
        start = perf_counter()
        traj, loss = traj_opt.make_opt_traj(reward, start_state, return_loss=True)
        stop = perf_counter()

        trajs.append(traj)
        losses.append(loss)

        times.append(stop - start)

    trajs_array = np.array(trajs)
    assert len(trajs_array.shape) == 3
    assert trajs_array.shape[1:] == (50, 2)

    if log_time:
        logging.info(f"Mean traj opt time={np.mean(times)}")
    return trajs_array, np.array(losses)
def align_worker(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    action_shape: Tuple[int, ...] = (2,),
) -> np.ndarray:
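    """Plans one optimal trajectory per (reward, state) pair in the batch.

    Intended as a joblib worker for make_plans; returns an array of plans with
    shape (batch_size, 50, *action_shape).
    """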
    batch_size = rewards.shape[0]
    assert states.shape[0] == batch_size
    plans = np.empty((batch_size, 50, *action_shape))
    for i, (reward, state) in enumerate(zip(rewards, states)):
        traj, _ = optim.make_opt_traj(reward, state)
        plans[i] = traj.reshape(-1, *action_shape)

    return plans
def make_plans(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    parallel: Optional[Parallel] = None,
    action_shape: Tuple[int, ...] = (2, ),
    memorize: bool = False,
) -> np.ndarray:
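    """Plans a 50-step trajectory for every combination of reward and start state.

    If a joblib Parallel instance is provided, the (reward, state) pairs are split
    across workers via align_worker; otherwise the plans are computed serially.
    Returns an array of shape (len(rewards), len(states), 50, *action_shape).
    """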

    assert shape_compat(
        rewards, (-1, 4)
    ), f"rewards shape={rewards.shape} is wrong, expected (-1, 4)"

    if parallel is not None:
        input_batches = np.array_split(list(product(rewards, states)),
                                       parallel.n_jobs)

        logging.debug("Branching")

        return np.concatenate(
            parallel(
                delayed(align_worker)(
                    rewards=batch[:, 0],
                    states=batch[:, 1],
                    optim=optim,
                    action_shape=action_shape,
                )
                for batch in input_batches)).reshape(len(rewards), len(states),
                                                     50, *action_shape)
    else:
        plans = np.empty((len(rewards), len(states), 50, *action_shape))
        for i, reward in enumerate(rewards):
            assert reward.shape == (4, )
            for j, state in enumerate(states):
                traj, _ = optim.make_opt_traj(reward, state, memorize=memorize)
                plans[i, j] = traj.reshape(-1, *action_shape)
        return plans
def make_test_rewards(
    epsilons: Sequence[float],
    true_reward: np.ndarray,
    n_rewards: int,
    outdir: Path,
    parallel: Parallel,
    n_test_states: Optional[int] = None,
    traj_opt: bool = False,
    max_attempts: int = 10,
    n_gt_test_questions: Optional[int] = None,
    use_equiv: bool = False,
    overwrite: bool = False,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """ Makes test rewards sets for every epsilon and saves them to a file. """
    traj_optimizer = (TrajOptimizer(n_planner_iters=100,
                                    optim=tf.keras.optimizers.Adam(0.2))
                      if traj_opt else None)

    reward_path = outdir / "test_rewards.pkl"

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = load(
        reward_path, overwrite=overwrite)
    if test_rewards is None:
        test_rewards = {}
    else:
        logging.info(f"Loading test rewards from {reward_path}")

    new_epsilons = set(epsilons) - test_rewards.keys()

    if len(new_epsilons) > 0:
        logging.info(f"Creating new test rewards for epsilons: {new_epsilons}")

    if (n_test_states is not None
            and n_test_states > 1) or len(new_epsilons) == 1:
        # Parallelize internally
        test_rewards.update({
            epsilon: find_reward_boundary(
                true_reward=true_reward,
                traj_optimizer=traj_optimizer,
                n_rewards=n_rewards,
                use_equiv=use_equiv,
                epsilon=epsilon,
                n_test_states=n_test_states,
                max_attempts=max_attempts,
                outdir=outdir,
                n_gt_test_questions=n_gt_test_questions,
                overwrite=overwrite,
                parallel=parallel,
            )[:2]
            for epsilon in new_epsilons
        })
    else:
        for rewards, alignment, epsilon in parallel(
                delayed(find_reward_boundary)(
                    true_reward=true_reward,
                    traj_optimizer=traj_optimizer,
                    n_rewards=n_rewards,
                    use_equiv=use_equiv,
                    epsilon=epsilon,
                    n_test_states=n_test_states,
                    max_attempts=max_attempts,
                    n_gt_test_questions=n_gt_test_questions,
                    outdir=outdir,
                    overwrite=overwrite,
                    parallel=None,
                ) for epsilon in new_epsilons):
            test_rewards[epsilon] = (rewards, alignment)

    logging.info(f"Writing generated test rewards to {reward_path}")
    with reward_path.open("wb") as f:
        pkl.dump(test_rewards, f)
    return test_rewards
def compare_test_labels(
    test_rewards_path: Path,
    true_reward_path: Path,
    traj_opt: bool = False,
    elicitation: bool = False,
    replications: Optional[str] = None,
    normals_path: Optional[Path] = None,
):
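    """Loads saved test rewards and compares their two sources of alignment labels.

    Depending on the flags, computes the missing label set (elicitation labels from
    preference normals, or trajectory-optimization labels via rewards_aligned), then
    prints the percentage of rewards on which the two label sets agree.
    """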
    if replications is not None:
        raise NotImplementedError("Replications not yet implemented")

    with test_rewards_path.open("rb") as f:
        starting_tests: Dict[float, Tuple[np.ndarray, np.ndarray]] = pkl.load(f)

    assert traj_opt != elicitation, "Provided labels must come from exactly one source"

    class Test(NamedTuple):
        rewards: np.ndarray
        q_labels: np.ndarray
        elicitation_labels: np.ndarray

    test_rewards: Dict[float, Test] = {}
    true_reward = np.load(true_reward_path)
    if traj_opt:
        assert normals_path is not None, "normals_path is required when traj_opt is set"
        normals = np.load(normals_path)

        for epsilon, (rewards, q_labels) in starting_tests.items():
            # Filter from the full set of normals each iteration so that one
            # epsilon's filter does not carry over to the next.
            epsilon_normals = normals[true_reward @ normals.T > epsilon]
            elicitation_labels = run_test(epsilon_normals, rewards, use_equiv=False)

            test_rewards[epsilon] = Test(rewards=rewards,
                                         q_labels=q_labels,
                                         elicitation_labels=elicitation_labels)
    elif elicitation:
        parallel = Parallel(n_jobs=-4)
        env = LegacyEnv(reward=true_reward, random_start=True)
        traj_optimizer = TrajOptimizer(10)
        for epsilon, (rewards, elicitation_labels) in starting_tests.items():
            q_labels = rewards_aligned(
                traj_optimizer=traj_optimizer,
                env=env,
                true_reward=true_reward,
                test_rewards=rewards,
                epsilon=epsilon,
                parallel=parallel,
            )

            test_rewards[epsilon] = Test(rewards=rewards,
                                         q_labels=q_labels,
                                         elicitation_labels=elicitation_labels)

    total_agree = 0
    total_rewards = 0
    for test in test_rewards.values():
        total_agree += np.sum(test.q_labels == test.elicitation_labels)
        total_rewards += len(test.rewards)

    print(
        f"Critic and superset labels agree on {total_agree / total_rewards * 100:.1f}% of rewards"
    )
def compare(
    reward_path: Path,
    td3_dir: Path,
    outdir: Path,
    planner_iters: int = 10,
    random_start: bool = False,
    n_starts: int = 1,
    replications: Optional[str] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
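    """Compares the trajectory optimizer against a trained TD3 policy.

    Rolls both out from the same start states, plots histograms of their returns
    and of the planner's regret, and pickles the start states, rewards, and policy
    trajectories for the cases where the planner underperformed.
    """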
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")
    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        for replication, td3_path in zip(replication_indices, td3_paths):
            compare(
                reward_path=Path(reward_path) / str(replication) / "true_reward.npy",
                outdir=Path(outdir) / str(replication),
                td3_dir=td3_path,
                planner_iters=planner_iters,
                random_start=random_start,
                n_starts=n_starts,
                verbosity=verbosity,
            )
        return

    reward_weights: np.ndarray = np.load(reward_path).astype(np.float32)
    env = gym.make("LegacyDriver-v1", reward=reward_weights, random_start=random_start)
    td3 = load_td3(env, td3_dir)

    traj_optimizer = TrajOptimizer(planner_iters)

    class BadPlannerCollection:
        def __init__(self):
            self.states: Optional[np.ndarray] = None
            self.rewards: Optional[np.ndarray] = None
            self.trajs: Optional[np.ndarray] = None

        def append(
            self, state: np.ndarray, reward: np.ndarray, traj: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
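            """Adds one (state, reward, trajectory) triple to the collection and
            returns the accumulated arrays."""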
            if self.states is None:
                self.states = np.array([state])
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.array([reward])
                self.trajs = np.array([traj])
            else:
                self.states = np.append(self.states, [state], axis=0)
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.append(self.rewards, [reward], axis=0)
                self.trajs = np.append(self.trajs, [traj], axis=0)

            self.check_shapes()

            return self.get()

        def check_shapes(self):
            assert len(self.states.shape) == 3
            assert len(self.rewards.shape) == 2
            assert len(self.trajs.shape) == 3

            assert self.states.shape[1:] == (2, 4)
            assert self.rewards.shape[1] == 4
            assert self.trajs.shape[1:] == (50, 2)

        def get(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
            return self.states, self.rewards, self.trajs

    planner_bad = BadPlannerCollection()

    returns = np.empty((n_starts, 2))
    for i in range(n_starts):
        logging.info(f"{i+1}/{n_starts}")
        start_state: np.ndarray = env.reset()

        logging.info("Optimizing traj")
        opt_traj = traj_optimizer.make_opt_traj(reward_weights, start_state)

        logging.info("Executing traj")
        opt_return = 0.0
        for action in opt_traj:
            state, reward, done, info = env.step(action)
            opt_return += reward

        opt_return = opt_return / len(opt_traj)

        logging.info("Evaluating policy")
        empirical_return, traj = eval(
            reward_weights=reward_weights,
            td3=td3,
            start_state=start_state,
            time_in_state=False,
            return_actions=True,
        )

        returns[i] = empirical_return, opt_return

        if opt_return < empirical_return:
            planner_bad.append(start_state, reward_weights, traj)

    outdir.mkdir(parents=True, exist_ok=True)
    plot_dir = outdir / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)

    plt.hist(returns[:, 0], label="Empirical", alpha=0.5)
    plt.hist(returns[:, 1], label="Optimal", alpha=0.5)
    plt.title("Histogram of Optimal vs Empirical returns")
    plt.legend()
    plt.savefig(plot_dir / "returns.png")
    plt.close()

    regret = returns[:, 1] - returns[:, 0]
    plt.hist(regret)
    plt.title("Histogram of regret")
    plt.savefig(plot_dir / "regret.png")
    plt.close()
    logging.info(f"Average regret = {np.mean(regret)}, min={np.min(regret)}, max={np.max(regret)}")

    with (outdir / "planner_mistakes.pkl").open("wb") as f:
        pickle.dump(planner_bad.get(), f)
def main(
    mistakes_path: Path,
    outdir: Path,
    plan_iters: int = 10,
    optim: Literal["sgd", "adam"] = "sgd",
    lr: float = 0.1,
    momentum: bool = False,
    nesterov: bool = False,
    extra_inits: bool = False,
    replications: Optional[str] = None,
    log_time: bool = False,
    log_best_inits: bool = False,
    n_traj_max: Optional[int] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
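    """Re-plans trajectories for the recorded planner mistakes.

    Optimizes a fresh trajectory for each stored (start, reward) pair with the
    configured optimizer, rolls out both the re-planned trajectory and the stored
    policy trajectory, and saves the paired returns and a summary plot.
    """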
    outdir = Path(outdir)
    experiment_dir = outdir / make_experiment(
        optim, lr, plan_iters, momentum, nesterov, extra_inits
    )
    experiment_dir.mkdir(parents=True, exist_ok=True)

    setup_logging(verbosity=verbosity, log_path=experiment_dir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)
        mistakes_paths = [
            Path(mistakes_path) / str(index) / "planner_mistakes.pkl"
            for index in replication_indices
        ]
    else:
        mistakes_paths = [Path(mistakes_path)]

    if optim == "sgd":
        optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov)
    elif optim == "adam":
        optimizer = Adam(learning_rate=lr)

    env = LegacyEnv(reward=np.zeros(4))

    starts, rewards, better_trajs = collect_mistakes(
        mistakes_paths=mistakes_paths, n_max=n_traj_max
    )

    init_controls = (
        np.array(
            [
                [[0.0, 1.0]] * 50,
                [[0.0, -1.0]] * 50,
                [[-0.5, -1.0]] * 50,
                [[0.5, -1.0]] * 50,
                [[0.5, 1.0]] * 50,
                [[-0.5, 1.0]] * 50,
            ]
        )
        if extra_inits
        else None
    )

    logging.info("Making trajectories")
    opt_trajs, losses = make_opt_trajs(
        traj_opt=TrajOptimizer(
            n_planner_iters=plan_iters,
            optim=optimizer,
            init_controls=init_controls,
            log_best_init=log_best_inits,
        ),
        rewards=rewards,
        starts=starts,
        log_time=log_time,
    )

    logging.info("Rolling out trajectories")
    returns = np.empty((len(starts), 2))
    for i, (start, reward_weights, opt_traj, policy_traj, loss) in enumerate(
        zip(starts, rewards, opt_trajs, better_trajs, losses)
    ):
        env.reward = reward_weights

        traj_opt_return = rollout(actions=opt_traj, env=env, start=start)
        policy_return = rollout(actions=policy_traj, env=env, start=start)

        assert (
            abs(traj_opt_return + loss) < 0.001
        ), f"Rollout={traj_opt_return} and loss={loss} differ by too much. start={start}, reward={reward_weights}"

        returns[i, 0] = traj_opt_return
        returns[i, 1] = policy_return

        logging.debug(
            f"Traj opt return={traj_opt_return}, loss={loss}, policy_return={policy_return}, delta={traj_opt_return-policy_return}"
        )

    np.save(experiment_dir / "returns.npy", returns)

    deltas = returns[:, 0] - returns[:, 1]

    logging.info(
        f"Mean delta={np.mean(deltas)}, mean better={np.mean(deltas > 0)*100:.1f}%, optim={optim}, lr={lr}, n={plan_iters}, momentum={momentum}, nesterov={nesterov}, extra inits={extra_inits}"
    )

    plot_returns(returns, experiment_dir)