def convert_legacy_td3(filename: Union[str, Path], replications: Optional[str] = None) -> None:
    """ Converts a legacy TD3 checkpoint that baked the reward features into the state vector to
    the current encoding, where they are stored as extra state dimensions. """

    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(filename), replication_indices)
        for td3_path in td3_paths:
            convert_legacy_td3(td3_path)
        exit()

    # Only convert checkpoints that haven't been converted already: the legacy state is exactly
    # car state + reward features, with no extra state dimensions recorded in the metadata.
    explicit_args = pickle.load(open(str(filename) + ".meta.pkl", "rb"), fix_imports=True)
    assert "extra_state_dim" not in explicit_args.keys()

    env = gym.make("LegacyDriver-v1", reward=np.zeros(4))

    state_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)

    explicit_args["extra_state_dim"] = 4
    max_action = max(np.max(env.action_space.high), -np.min(env.action_space.low))
    td3 = Td3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        **explicit_args,
    )
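    # Loading here presumably acts as a sanity check that the checkpoint is compatible with the
    # new extra_state_dim; only the metadata file is rewritten below, the weights are untouched.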
    td3.load(str(filename))

    pickle.dump(explicit_args, open(str(filename) + ".meta.pkl", "wb"))
def refine(
    reward_path: Path,
    td3_dir: Path,
    env_iters: int = int(1e5),
    batch_size: int = 100,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    logging.basicConfig(level=verbosity)
    if replications is not None:
        replication_indices = parse_replications(replications)
        reward_dir, reward_name = make_reward_path(reward_path)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        Parallel(n_jobs=-2)(
            delayed(refine)(
                reward_path=reward_dir / str(i) / reward_name,
                td3_dir=td3_path,
                env_iters=env_iters,
                batch_size=batch_size,
            )
            for i, td3_path in zip(replication_indices, td3_paths)
        )
        exit()
    td3_dir = Path(td3_dir)
    writer = SummaryWriter(log_dir=td3_dir.parent, filename_suffix="refine")

    reward_weights = np.load(reward_path)
    env = gym.make("LegacyDriver-v1", reward=reward_weights)
    td3 = load_td3(env, td3_dir, writer=writer)
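    # load_td3 restores the previously trained policy; the refinement loop below only updates its
    # critic against the newly supplied reward weights.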

    buffer = ReplayBuffer(td3.state_dim, td3.action_dim, writer=writer)
    logging.info("Initialized TD3 algorithm")

    raw_state = env.reset()
    state = make_TD3_state(raw_state, reward_features=env.features(raw_state))
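    # The TD3 state is the raw environment state augmented with the reward features (presumably
    # concatenated by make_TD3_state, which is defined elsewhere in the repo).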

    for t in range(env_iters):
        action = td3.select_action(state)
        next_raw_state, reward, done, info = env.step(action)
        log_step(next_raw_state, action, reward, info, log_iter=t, writer=writer)

        reward_features = info["reward_features"]

        if done:
            assert t % env.HORIZON == env.HORIZON - 1, f"Done at t={t} when horizon={env.HORIZON}"
            next_raw_state = env.reset()
            next_state = make_TD3_state(
                next_raw_state, reward_features=env.main_car.features(next_raw_state, None).numpy()
            )
        else:
            next_state = make_TD3_state(next_raw_state, reward_features)

        # Store data in replay buffer
        buffer.add(state, action, next_state, reward, done=float(done))

        state = next_state

        td3.update_critic(*buffer.sample(batch_size))

        td3.save(str(td3_dir) + "_refined")
def premake_test_rewards(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    true_reward_name: Path = Path("true_reward.npy"),
    datadir: Path = Path(),
    outdir: Path = Path(),
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    use_equiv: bool = False,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    """ Finds test rewards for each experiment. """
    outdir.mkdir(parents=True, exist_ok=True)
    # TODO(joschnei): I'm making some dangerous logging decisions. Do I want to append to logs, or
    # give logs unique names? I really need to pick at least one.
    setup_logging(verbosity, log_path=outdir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(
                    f"Replication {replication} does not exist, skipping")
                continue

            premake_test_rewards(
                epsilons=epsilons,
                n_rewards=n_rewards,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                true_reward_name=true_reward_name,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_equiv=use_equiv,
                n_cpus=n_cpus,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            logging.info(f"Done with replication {replication}")
        exit()

    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, 4)

    with Parallel(n_jobs=n_cpus) as parallel:
        make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite,
        )
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """ Evaluates alignment test generated by ground-truth rewards. """
    logging.basicConfig(level=verbosity,
                        format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(
                    f"Replication {replication} does not exist, skipping")
                continue

            logging.info(f"Starting replication {replication}")

            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        exit()

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # Argh defaults to parsing a value as a string if it's Optional, so coerce it explicitly.
        n_random_test_questions = int(n_random_test_questions)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")
    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(f"""Filtering settings:
    # reward samples={n_reward_samples},
    use mean reward={use_mean_reward},
    skip duplicates={skip_remove_duplicates},
    skip noise=True,
    skip epsilon={skip_epsilon_filtering},
    skip redundancy={skip_redundancy_filtering},
    use true epsilon={use_true_epsilon}
    """)

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path,
                                                    overwrite_results,
                                                    default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path,
                                                       overwrite_results,
                                                       default={})

    experiments = make_experiments(epsilons,
                                   deltas,
                                   human_samples,
                                   overwrite_results,
                                   experiments=set(minimal_tests.keys()))

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=flags["equiv_size"],
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0
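        # A test question agrees with the ground-truth reward w exactly when w @ n > 0 for its
        # normal n, i.e. the recorded preference is the one w itself would produce.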

        logging.info(
            f"{np.mean(good_indices) * 100:.2f}% of new test questions agree with the gt reward."
        )

        if use_cheating_questions:
            logging.info(f"Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(elicited_normals[:max_n], preferences,
                                 use_equiv, n_reward_features)
        logging.debug(f"normals={normals[:10]}")

        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward,
                                                epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
            delayed(run_gt_experiment)(
                normals=normals,
                test_rewards=test_rewards[epsilon][0],
                test_reward_alignment=test_rewards[epsilon][1],
                epsilon=epsilon,
                delta=delta,
                use_equiv=use_equiv,
                n_human_samples=n,
                factory=factory,
                input_features=input_features,
                preferences=preferences,
                outdir=outdir,
                verbosity=verbosity,
            ) for epsilon, delta, n in experiments):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    pkl.dump(confusions, open(confusion_path, "wb"))
    pkl.dump(minimal_tests, open(test_path, "wb"))
def simulated(
    outdir: Path,
    criterion: Literal["information", "volume", "random"],
    termination_threshold: float,
    n_reward_samples: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: Optional[float] = None,
    true_reward_path: Optional[Path] = None,
    continuous: bool = False,
    overwrite: bool = False,
    replications: Optional[str] = None,
):
    """ Generates a test by eliciting from a human simulated by a ground truth reward. """
    if replications is not None:
        replication_indices = parse_replications(replications)
        if true_reward_path is not None:
            reward_dir, reward_name = make_reward_path(true_reward_path)
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    true_reward_path=reward_dir / str(i) / reward_name,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        else:
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        exit()

    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    env = Driver()
    d = env.num_of_features

    if true_reward_path is not None:
        logging.info(f"Loading true reward from {true_reward_path}")
        true_reward = np.load(true_reward_path)
    else:
        logging.info("Randomly generating true reward")
        true_reward = np.random.normal(size=(4,))
        true_reward = true_reward / np.linalg.norm(true_reward)
        np.save(outdir / "true_reward.npy", true_reward)

    pickle.dump(
        {
            "criterion": criterion,
            "reward_iterations": n_reward_samples,
            "stop_thresh": termination_threshold,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    # If there is already data, feed it to the w_sampler to get the right posterior.
    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
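    # Active elicitation loop: keep asking the query proposed by the acquisition criterion until
    # its score drops below the termination threshold.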
    try:
        while score >= termination_threshold:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                sample_count=n_reward_samples, query_type=query_type, delta=equiv_size
            )

            input_A, input_B, score = run_algo(criterion, env, w_samples, delta_samples, continuous)
            logging.info(f"Score={score}")

            if score > termination_threshold:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_simulated_feedback(
                    simulation=env,
                    input_A=input_A,
                    input_B=input_B,
                    query_type=query_type,
                    true_reward=true_reward,
                    delta=equiv_size,
                )
                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
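                # phi_A - phi_B is the normal of the halfspace induced by this comparison; the
                # stored preference records which side of it the true reward lies on.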
                preferences = append(preferences, preference)
                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not interrupt again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
def main(
    n_questions: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: float = 1.1,
    reward_iterations: int = 100,
    outdir: Path = Path("data/simulated/random/elicitation"),
    human: bool = False,
    reward_path: Optional[Path] = None,
    replications: Optional[str] = None,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    outpath = Path(outdir)
    outpath.mkdir(parents=True, exist_ok=True)
    setup_logging(verbosity=verbosity, log_path=outpath / "log.txt")

    if not human:
        assert reward_path is not None
        reward_dir, reward_name = make_reward_path(reward_path)
        reward_path = reward_dir / reward_name

    if replications is not None:
        replication_indices = parse_replications(replications)
        n_cpus = min(multiprocessing.cpu_count() - 4, len(replication_indices))
        Parallel(n_jobs=n_cpus)(
            delayed(main)(
                n_questions=n_questions,
                query_type=query_type,
                equiv_size=equiv_size,
                reward_iterations=reward_iterations,
                outdir=outpath / str(i),
                human=human,
                reward_path=reward_dir / str(i) / reward_name if not human else None,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            for i in replication_indices
        )
        exit()

    if not human:
        assert reward_path is not None
        if not reward_path.exists():
            logging.warning("Reward path given does not exist, generating random reward.")
            true_reward = np.random.default_rng().normal(loc=0, scale=1, size=(4,))
            true_reward = safe_normalize(true_reward)
            np.save(reward_path, true_reward)
        else:
            true_reward = np.load(reward_path)

    pickle.dump(
        {
            "n_questions": n_questions,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "reward_iterations": reward_iterations,
            "human": human,
        },
        open(outpath / "flags.pkl", "wb"),
    )

    normals = load(outpath / "normals.npy", overwrite=overwrite)
    preferences = load(outpath / "preferences.npy", overwrite=overwrite)
    # TODO(joschnei): Make class for inputs, dimensions are too difficult to reason about
    # (N, 2, 100)
    inputs = load(outpath / "inputs.npy", overwrite=overwrite)
    input_features = load(outpath / "input_features.npy", overwrite=overwrite)

    env = Driver()

    if (
        inputs is not None
        and input_features is not None
        and inputs.shape[0] > input_features.shape[0]
    ):
        logging.info("Catching up to previously generated trajectories.")
        input_A, input_B = inputs[-1]

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )

        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    # Questions and inputs are duplicated, but this keeps everything consistent for the hot-load case
    new_questions = n_questions - inputs.shape[0] if inputs is not None else n_questions
    questions = make_random_questions(n_questions=new_questions, env=env)
    logging.debug(f"questions={questions[:10]}")

    if inputs is not None:
        assert input_features is not None
        assert normals is not None
        assert preferences is not None
        assert inputs.shape[0] == input_features.shape[0]
        assert inputs.shape[0] == normals.shape[0]
        assert inputs.shape[0] == preferences.shape[0]

    for input_A, input_B in questions:
        inputs = update_inputs(input_A, input_B, inputs, outpath)

        if inputs.shape[0] % 10 == 0:
            logging.info(f"{inputs.shape[0]} of {n_questions}")

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )

        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    save_reward(
        query_type=query_type,
        true_delta=equiv_size,
        w_sampler=Sampler(env.num_of_features),
        n_reward_samples=reward_iterations,
        outdir=outpath,
    )
def compare(
    reward_path: Path,
    td3_dir: Path,
    outdir: Path,
    planner_iters: int = 10,
    random_start: bool = False,
    n_starts: int = 1,
    replications: Optional[str] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")
    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        for replication, td3_path in zip(replication_indices, td3_paths):
            compare(
                reward_path=Path(reward_path) / str(replication) / "true_reward.npy",
                outdir=Path(outdir) / str(replication),
                td3_dir=td3_path,
                planner_iters=planner_iters,
                random_start=random_start,
                n_starts=n_starts,
                verbosity=verbosity,
            )
        exit()

    reward_weights: np.ndarray = np.load(reward_path).astype(np.float32)
    env = gym.make("LegacyDriver-v1", reward=reward_weights, random_start=random_start)
    td3 = load_td3(env, td3_dir)

    traj_optimizer = TrajOptimizer(planner_iters)

    class BadPlannerCollection:
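        """ Collects (start state, reward weights, policy trajectory) triples for starts where the
        policy rollout outperformed the planner's trajectory. """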
        def __init__(self):
            self.states = None
            self.rewards = None
            self.trajs = None

        def append(
            self, state: np.ndarray, reward: np.ndarray, traj: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
            if self.states is None:
                self.states = np.array([state])
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.array([reward])
                self.trajs = np.array([traj])
            else:
                self.states = np.append(self.states, [state], axis=0)
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.append(self.rewards, [reward], axis=0)
                self.trajs = np.append(self.trajs, [traj], axis=0)

            self.check_shapes()

            return self.get()

        def check_shapes(self):
            assert len(self.states.shape) == 3
            assert len(self.rewards.shape) == 2
            assert len(self.trajs.shape) == 3

            assert self.states.shape[1:] == (2, 4)
            assert self.rewards.shape[1] == 4
            assert self.trajs.shape[1:] == (50, 2)

        def get(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
            return self.states, self.rewards, self.trajs

    planner_bad = BadPlannerCollection()

    returns = np.empty((n_starts, 2))
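    # Column 0 holds the policy's empirical return, column 1 the planner's (optimal-trajectory) return.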
    for i in range(n_starts):
        logging.info(f"{i+1}/{n_starts}")
        start_state: np.ndarray = env.reset()

        logging.info("Optimizing traj")
        opt_traj = traj_optimizer.make_opt_traj(reward_weights, start_state)

        logging.info("Executing traj")
        opt_return = 0.0
        for action in opt_traj:
            state, reward, done, info = env.step(action)
            opt_return += reward

        opt_return = opt_return / len(opt_traj)

        logging.info("Evaluating policy")
        empirical_return, traj = eval(
            reward_weights=reward_weights,
            td3=td3,
            start_state=start_state,
            time_in_state=False,
            return_actions=True,
        )

        returns[i] = empirical_return, opt_return

        if opt_return < empirical_return:
            planner_bad.append(start_state, reward_weights, traj)

    outdir.mkdir(parents=True, exist_ok=True)
    plot_dir = outdir / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)

    plt.hist(returns[:, 0], label="Empirical", alpha=0.5)
    plt.hist(returns[:, 1], label="Optimal", alpha=0.5)
    plt.title("Histogram of Optimal vs Empirical returns")
    plt.legend()
    plt.savefig(plot_dir / "returns.png")
    plt.close()

    regret = returns[:, 1] - returns[:, 0]
    plt.hist(regret)
    plt.title("Histogram of regret")
    plt.savefig(plot_dir / "regret.png")
    plt.close()
    logging.info(f"Average regret = {np.mean(regret)}, min={np.min(regret)}, max={np.max(regret)}")

    pickle.dump(planner_bad.get(), (outdir / "planner_mistakes.pkl").open("wb"))
def train(
    outdir: Path,
    reward_path: Path = Path(),
    actor_layers: List[int] = [256, 256],
    critic_layers: List[int] = [256, 256],
    dense: bool = False,
    use_reward_features: bool = True,
    n_timesteps: int = int(1e6),
    n_random_timesteps: int = int(25e3),
    exploration_noise: float = 0.1,
    batch_size: int = 256,
    save_period: int = int(5e3),
    model_name: str = "policy",
    random_rewards: bool = False,
    timestamp: bool = False,
    random_start: bool = False,
    replications: Optional[str] = None,
    plot_episodes: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    logging.basicConfig(level=verbosity)

    if replications is not None:
        replication_indices = parse_replications(replications)

        if not random_rewards:
            reward_dir, reward_name = make_reward_path(reward_path)
            reward_paths = [reward_dir / str(i) / reward_name for i in replication_indices]
        else:
            reward_paths = [None for _ in replication_indices]

        Parallel(n_jobs=-2)(
            delayed(train)(
                reward_path=reward_path,
                outdir=Path(outdir) / str(i),
                actor_layers=actor_layers,
                critic_layers=critic_layers,
                dense=dense,
                n_timesteps=n_timesteps,
                n_random_timesteps=n_random_timesteps,
                exploration_noise=exploration_noise,
                batch_size=batch_size,
                save_period=save_period,
                model_name=model_name,
                timestamp=timestamp,
                random_start=random_start,
                random_rewards=random_rewards,
            )
            for i, reward_path in zip(replication_indices, reward_paths)
        )
        exit()

    outdir = make_outdir(outdir, timestamp)

    writer = SummaryWriter(log_dir=outdir)
    logging.basicConfig(filename=outdir / "log", level=verbosity)

    if not random_rewards:
        reward_weights = np.load(reward_path)
        env: LegacyEnv = gym.make(
            "LegacyDriver-v1", reward=reward_weights, random_start=random_start, time_in_state=True
        )
    else:
        env = gym.make("RandomLegacyDriver-v1", random_start=random_start, time_in_state=True)

    action_shape = env.action_space.sample().shape
    logging.info("Initialized env")

    if (outdir / (model_name + "_actor")).exists():
        td3 = load_td3(env=env, filename=outdir / model_name, writer=writer)
    else:
        td3 = make_td3(
            env,
            actor_kwargs={"layers": actor_layers, "dense": dense},
            critic_kwargs={"layers": critic_layers, "dense": dense},
            writer=writer,
            extra_state_dim=(random_rewards + use_reward_features) * len(env.reward),
        )
    buffer = ReplayBuffer(td3.state_dim + td3.extra_state_dim, td3.action_dim, writer=writer)
    logging.info("Initialized TD3 algorithm")

    raw_state = env.reset()
    logging.debug(f"raw_state={raw_state}")
    state = make_TD3_state(
        raw_state,
        reward_features=env.features(raw_state),
        reward_weights=env.reward_weights if random_rewards else None,
    )

    episode_reward_features = np.empty((env.HORIZON, *env.reward.shape))
    episode_actions = np.empty((env.HORIZON, *env.action_space.shape))
    best_return = float("-inf")
    for t in range(n_timesteps):
        action = pick_action(t, n_random_timesteps, env, td3, state, exploration_noise)
        assert action.shape == action_shape, f"Action shape={action.shape}, expected={action_shape}"
        next_raw_state, reward, done, info = env.step(action)
        log_step(next_raw_state, action, reward, info, log_iter=t, writer=writer)

        # Record the reward features and actions of the final episode before each checkpoint so
        # they can be plotted when the model is saved.
        reward_features = info["reward_features"]
        if save_period - (t % save_period) <= env.HORIZON:
            episode_reward_features[t % env.HORIZON] = reward_features
            episode_actions[t % env.HORIZON] = action

        if done:
            assert t % env.HORIZON == env.HORIZON - 1, f"Done at t={t} when horizon={env.HORIZON}"
            next_raw_state = env.reset()
            next_state = make_TD3_state(
                next_raw_state,
                reward_features=env.features(next_raw_state),
                reward_weights=env.reward_weights if random_rewards else None,
            )
        else:
            next_state = make_TD3_state(
                next_raw_state,
                reward_features,
                reward_weights=info["reward_weights"] if random_rewards else None,
            )

        # Store data in replay buffer
        buffer.add(state, action, next_state, reward, done=float(done))

        state = next_state

        # Train agent after collecting sufficient data
        if t >= n_random_timesteps:
            td3.train(buffer, batch_size)

        if t % save_period == 0:
            logging.info(f"{t} / {n_timesteps}")

            if plot_episodes and t != 0:
                plot_heading(
                    heading=episode_reward_features[:, 2],
                    outdir=outdir / "plots" / "heading",
                    name=str(t // save_period),
                )
                plot_turn(
                    turn=episode_actions[:, 0],
                    outdir=outdir / "plots" / "turn",
                    name=str(t // save_period),
                )
            td3.save(str(outdir / model_name))

            logging.info("Evaluating the policy")
            # TODO(joschnei): If random_rewards, generate either a fixed or random-per-eval bag of
            # eval rewards and save the policy if it has better mean return over the bag of
            # eval-rewards. Otherwise just use the fixed reward.
            if random_rewards:
                raise NotImplementedError("Random rewards haven't been fully implemented yet.")
            eval_return = eval(
                reward_weights,
                td3=td3,
                writer=writer,
                log_iter=t // save_period,
            )
            if eval_return > best_return:
                best_return = eval_return
                td3.save(str(outdir / (f"best_{model_name}")))
def main(
    mistakes_path: Path,
    outdir: Path,
    plan_iters: int = 10,
    optim: Literal["sgd", "adam"] = "sgd",
    lr: float = 0.1,
    momentum: bool = False,
    nesterov: bool = False,
    extra_inits: bool = False,
    replications: Optional[str] = None,
    log_time: bool = False,
    log_best_inits: bool = False,
    n_traj_max: Optional[int] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    outdir = Path(outdir)
    experiment_dir = outdir / make_experiment(
        optim, lr, plan_iters, momentum, nesterov, extra_inits
    )
    experiment_dir.mkdir(parents=True, exist_ok=True)

    setup_logging(verbosity=verbosity, log_path=experiment_dir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)
        mistakes_paths = [
            Path(mistakes_path) / str(index) / "planner_mistakes.pkl"
            for index in replication_indices
        ]
    else:
        mistakes_paths = [Path(mistakes_path)]

    if optim == "sgd":
        optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov)
    elif optim == "adam":
        optimizer = Adam(learning_rate=lr)

    env = LegacyEnv(reward=np.zeros(4))

    starts, rewards, better_trajs = collect_mistakes(
        mistakes_paths=mistakes_paths, n_max=n_traj_max
    )

    init_controls = (
        np.array(
            [
                [[0.0, 1.0]] * 50,
                [[0.0, -1.0]] * 50,
                [[-0.5, -1.0]] * 50,
                [[0.5, -1.0]] * 50,
                [[0.5, 1.0]] * 50,
                [[-0.5, 1.0]] * 50,
            ]
        )
        if extra_inits
        else None
    )
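    # When extra_inits is set, these six constant 50-step control sequences are used as additional
    # initializations for the trajectory optimizer.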

    logging.info("Making trajectories")
    opt_trajs, losses = make_opt_trajs(
        traj_opt=TrajOptimizer(
            n_planner_iters=plan_iters,
            optim=optimizer,
            init_controls=init_controls,
            log_best_init=log_best_inits,
        ),
        rewards=rewards,
        starts=starts,
        log_time=log_time,
    )

    logging.info("Rolling out trajectories")
    returns = np.empty((len(starts), 2))
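    # returns[:, 0] is the trajectory optimizer's return, returns[:, 1] the policy rollout's return.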
    for i, (start, reward_weights, opt_traj, policy_traj, loss) in enumerate(
        zip(starts, rewards, opt_trajs, better_trajs, losses)
    ):
        env.reward = reward_weights

        traj_opt_return = rollout(actions=opt_traj, env=env, start=start)
        policy_return = rollout(actions=policy_traj, env=env, start=start)

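        # The trajectory optimizer minimizes negative return, so the rolled-out return should match -loss.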
        assert (
            abs(traj_opt_return + loss) < 0.001
        ), f"Rollout={traj_opt_return} and loss={loss}, differ by too much. start={start}, reward={reward_weights}"

        returns[i, 0] = traj_opt_return
        returns[i, 1] = policy_return

        logging.debug(
            f"Traj opt return={traj_opt_return}, loss={loss}, policy_return={policy_return}, delta={traj_opt_return-policy_return}"
        )

    np.save(experiment_dir / "returns.npy", returns)

    deltas = returns[:, 0] - returns[:, 1]
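    # delta > 0 means the trajectory optimizer beat the policy rollout from that start state.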

    logging.info(
        f"Mean delta={np.mean(deltas)}, mean better={np.mean(deltas > 0)*100:.1f}%, optim={optim}, lr={lr}, n={plan_iters}, momentum={momentum}, nesterov={nesterov}, extra inits={extra_inits}"
    )

    plot_returns(returns, experiment_dir)