def make_opt_trajs(
    traj_opt: TrajOptimizer,
    rewards: np.ndarray,
    starts: np.ndarray,
    log_time: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Plans one optimal trajectory per (reward, start state) pair.

    Returns the stacked trajectories and their planner losses.
    """
    trajs = []
    losses = []
    times = []
    for reward, start_state in zip(rewards, starts):
        start = perf_counter()
        traj, loss = traj_opt.make_opt_traj(reward, start_state, return_loss=True)
        stop = perf_counter()

        trajs.append(traj)
        losses.append(loss)
        times.append(stop - start)

    trajs_array = np.array(trajs)
    assert len(trajs_array.shape) == 3
    assert trajs_array.shape[1:] == (50, 2)

    if log_time:
        logging.info(f"Mean traj opt time={np.mean(times)}")

    return trajs_array, np.array(losses)
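# Usage sketch for make_opt_trajs, not part of the original pipeline. Assumptions: the
# TrajOptimizer arguments mirror calls made elsewhere in this module, LegacyEnv.reset()
# returns a start state in the format make_opt_traj expects, and the random reward
# vectors are purely illustrative.
def _example_make_opt_trajs() -> None:
    rng = np.random.default_rng(0)
    rewards = rng.normal(size=(3, 4)).astype(np.float32)  # one 4-d reward vector per row
    starts = np.array([LegacyEnv(reward=r, random_start=True).reset() for r in rewards])

    optimizer = TrajOptimizer(n_planner_iters=10, optim=tf.keras.optimizers.Adam(0.2))
    trajs, losses = make_opt_trajs(optimizer, rewards, starts, log_time=True)

    # Each planned trajectory is 50 two-dimensional actions; losses holds one planner
    # loss per trajectory.
    assert trajs.shape == (3, 50, 2)
    logging.info(f"losses={losses}")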
def align_worker(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    action_shape: Tuple[int, ...] = (2,),
) -> np.ndarray:
    """Plans one optimal trajectory per (reward, state) pair inside a single parallel worker."""
    batch_size = rewards.shape[0]
    assert states.shape[0] == batch_size

    plans = np.empty((batch_size, 50, *action_shape))
    for i, (reward, state) in enumerate(zip(rewards, states)):
        traj, _ = optim.make_opt_traj(reward, state)
        plans[i] = traj.reshape(-1, *action_shape)
    return plans
def make_plans(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    parallel: Optional[Parallel] = None,
    action_shape: Tuple[int, ...] = (2,),
    memorize: bool = False,
) -> np.ndarray:
    """Plans an optimal trajectory for every (reward, state) pair.

    Returns an array of shape (len(rewards), len(states), 50, *action_shape).
    """
    assert shape_compat(
        rewards, (-1, 4)
    ), f"rewards shape={rewards.shape} is wrong, expected (-1, 4)"

    if parallel is not None:
        # Split the (reward, state) cross product evenly across workers.
        input_batches = np.array_split(list(product(rewards, states)), parallel.n_jobs)
        logging.debug("Branching")
        return np.concatenate(
            parallel(
                delayed(align_worker)(
                    rewards=batch[:, 0],
                    states=batch[:, 1],
                    optim=optim,
                    action_shape=action_shape,
                )
                for batch in input_batches
            )
        ).reshape(len(rewards), len(states), 50, *action_shape)
    else:
        plans = np.empty((len(rewards), len(states), 50, *action_shape))
        for i, reward in enumerate(rewards):
            assert reward.shape == (4,)
            for j, state in enumerate(states):
                traj, _ = optim.make_opt_traj(reward, state, memorize=memorize)
                plans[i, j] = traj.reshape(-1, *action_shape)
        return plans
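# Sketch of driving make_plans in parallel, not part of the original pipeline. Assumption:
# Parallel/delayed are joblib's, as the delayed(...) call pattern above suggests, so the
# context-manager form and n_jobs argument apply.
def _example_make_plans(rewards: np.ndarray, states: np.ndarray) -> np.ndarray:
    optim = TrajOptimizer(n_planner_iters=10, optim=tf.keras.optimizers.Adam(0.2))
    with Parallel(n_jobs=2) as parallel:
        plans = make_plans(rewards, states, optim, parallel=parallel)

    # One 50-step plan of 2-d actions for every (reward, state) pair.
    assert plans.shape == (len(rewards), len(states), 50, 2)
    return plans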
def make_test_rewards(
    epsilons: Sequence[float],
    true_reward: np.ndarray,
    n_rewards: int,
    outdir: Path,
    parallel: Parallel,
    n_test_states: Optional[int] = None,
    traj_opt: bool = False,
    max_attempts: int = 10,
    n_gt_test_questions: Optional[int] = None,
    use_equiv: bool = False,
    overwrite: bool = False,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """Makes a test reward set for every epsilon and saves them all to a file."""
    traj_optimizer = (
        TrajOptimizer(n_planner_iters=100, optim=tf.keras.optimizers.Adam(0.2))
        if traj_opt
        else None
    )

    reward_path = outdir / "test_rewards.pkl"

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = load(
        reward_path, overwrite=overwrite
    )
    if test_rewards is None:
        test_rewards = {}
    else:
        logging.info(f"Loading test rewards from {reward_path}")

    new_epsilons = set(epsilons) - test_rewards.keys()
    if len(new_epsilons) > 0:
        logging.info(f"Creating new test rewards for epsilons: {new_epsilons}")

    if (n_test_states is not None and n_test_states > 1) or len(new_epsilons) == 1:
        # Parallelize internally, inside find_reward_boundary.
        test_rewards.update(
            {
                epsilon: find_reward_boundary(
                    true_reward=true_reward,
                    traj_optimizer=traj_optimizer,
                    n_rewards=n_rewards,
                    use_equiv=use_equiv,
                    epsilon=epsilon,
                    n_test_states=n_test_states,
                    max_attempts=max_attempts,
                    outdir=outdir,
                    n_gt_test_questions=n_gt_test_questions,
                    overwrite=overwrite,
                    parallel=parallel,
                )[:2]
                for epsilon in new_epsilons
            }
        )
    else:
        # Parallelize over epsilons instead.
        for rewards, alignment, epsilon in parallel(
            delayed(find_reward_boundary)(
                true_reward=true_reward,
                traj_optimizer=traj_optimizer,
                n_rewards=n_rewards,
                use_equiv=use_equiv,
                epsilon=epsilon,
                n_test_states=n_test_states,
                max_attempts=max_attempts,
                n_gt_test_questions=n_gt_test_questions,
                outdir=outdir,
                overwrite=overwrite,
                parallel=None,
            )
            for epsilon in new_epsilons
        ):
            test_rewards[epsilon] = (rewards, alignment)

    logging.info(f"Writing generated test rewards to {reward_path}")
    with reward_path.open("wb") as f:
        pkl.dump(test_rewards, f)

    return test_rewards
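# Sketch of reading the file make_test_rewards writes, not part of the original pipeline.
# Assumption: the pickle maps each epsilon to a (rewards, alignment) pair as constructed
# above, with alignment being a numeric/boolean label array per reward.
def _example_load_test_rewards(outdir: Path) -> None:
    with (outdir / "test_rewards.pkl").open("rb") as f:
        test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = pkl.load(f)

    for epsilon, (rewards, alignment) in test_rewards.items():
        logging.info(
            f"epsilon={epsilon}: {len(rewards)} rewards, mean alignment label={np.mean(alignment):.2f}"
        )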
def compare_test_labels(
    test_rewards_path: Path,
    true_reward_path: Path,
    traj_opt: bool = False,
    elicitation: bool = False,
    replications: Optional[str] = None,
    normals_path: Optional[Path] = None,
):
    if replications is not None:
        raise NotImplementedError("Replications not yet implemented")

    starting_tests: Dict[float, Tuple[np.ndarray, np.ndarray]] = pkl.load(
        open(test_rewards_path, "rb")
    )

    assert traj_opt != elicitation, "Provided labels must come from exactly one source"

    class Test(NamedTuple):
        rewards: np.ndarray
        q_labels: np.ndarray
        elicitation_labels: np.ndarray

    test_rewards: Dict[float, Test] = {}
    true_reward = np.load(true_reward_path)

    if traj_opt:
        normals = np.load(normals_path)
        for epsilon, (rewards, q_labels) in starting_tests.items():
            # Keep only the normals whose margin under the true reward exceeds epsilon,
            # filtering from the full normal set for every epsilon.
            epsilon_normals = normals[true_reward @ normals.T > epsilon]
            elicitation_labels = run_test(epsilon_normals, rewards, use_equiv=False)
            test_rewards[epsilon] = Test(
                rewards=rewards, q_labels=q_labels, elicitation_labels=elicitation_labels
            )
    elif elicitation:
        parallel = Parallel(n_jobs=-4)
        env = LegacyEnv(reward=true_reward, random_start=True)
        traj_optimizer = TrajOptimizer(10)
        for epsilon, (rewards, elicitation_labels) in starting_tests.items():
            q_labels = rewards_aligned(
                traj_optimizer=traj_optimizer,
                env=env,
                true_reward=true_reward,
                test_rewards=rewards,
                epsilon=epsilon,
                parallel=parallel,
            )
            test_rewards[epsilon] = Test(
                rewards=rewards, q_labels=q_labels, elicitation_labels=elicitation_labels
            )

    total_agree = 0
    total_rewards = 0
    for epsilon, test in test_rewards.items():
        total_agree += np.sum(test.q_labels == test.elicitation_labels)
        total_rewards += len(test.rewards)

    print(
        f"Critic and superset labels agree on {total_agree / total_rewards * 100:.1f}% of rewards"
    )
def compare(
    reward_path: Path,
    td3_dir: Path,
    outdir: Path,
    planner_iters: int = 10,
    random_start: bool = False,
    n_starts: int = 1,
    replications: Optional[str] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    """Rolls out the TD3 policy and the trajectory optimizer from shared start states,
    plots their returns, and records the cases where the planner does worse than the policy.
    """
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        for replication, td3_path in zip(replication_indices, td3_paths):
            compare(
                reward_path=Path(reward_path) / str(replication) / "true_reward.npy",
                outdir=Path(outdir) / str(replication),
                td3_dir=td3_path,
                planner_iters=planner_iters,
                random_start=random_start,
                n_starts=n_starts,
                verbosity=verbosity,
            )
        exit()

    reward_weights: np.ndarray = np.load(reward_path).astype(np.float32)
    env = gym.make("LegacyDriver-v1", reward=reward_weights, random_start=random_start)
    td3 = load_td3(env, td3_dir)

    traj_optimizer = TrajOptimizer(planner_iters)

    class BadPlannerCollection:
        """Accumulates the (state, reward, trajectory) triples where the planner underperformed the policy."""

        def __init__(self):
            self.states = None
            self.rewards = None
            self.trajs = None

        def append(
            self, state: np.ndarray, reward: np.ndarray, traj: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
            if self.states is None:
                self.states = np.array([state])
                logging.debug(f"state shape={state.shape}, states shape={self.states.shape}")
                self.rewards = np.array([reward])
                self.trajs = np.array([traj])
            else:
                self.states = np.append(self.states, [state], axis=0)
                logging.debug(f"state shape={state.shape}, states shape={self.states.shape}")
                self.rewards = np.append(self.rewards, [reward], axis=0)
                self.trajs = np.append(self.trajs, [traj], axis=0)

            self.check_shapes()
            return self.get()

        def check_shapes(self):
            assert len(self.states.shape) == 3
            assert len(self.rewards.shape) == 2
            assert len(self.trajs.shape) == 3
            assert self.states.shape[1:] == (2, 4)
            assert self.rewards.shape[1] == 4
            assert self.trajs.shape[1:] == (50, 2)

        def get(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
            return self.states, self.rewards, self.trajs

    planner_bad = BadPlannerCollection()

    returns = np.empty((n_starts, 2))
    for i in range(n_starts):
        logging.info(f"{i+1}/{n_starts}")
        start_state: np.ndarray = env.reset()

        logging.info("Optimizing traj")
        opt_traj = traj_optimizer.make_opt_traj(reward_weights, start_state)

        logging.info("Executing traj")
        opt_return = 0.0
        for action in opt_traj:
            state, reward, done, info = env.step(action)
            opt_return += reward
        opt_return = opt_return / len(opt_traj)

        logging.info("Evaluating policy")
        empirical_return, traj = eval(
            reward_weights=reward_weights,
            td3=td3,
            start_state=start_state,
            time_in_state=False,
            return_actions=True,
        )

        returns[i] = empirical_return, opt_return

        if opt_return < empirical_return:
            planner_bad.append(start_state, reward_weights, traj)

    outdir.mkdir(parents=True, exist_ok=True)
    plot_dir = outdir / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)

    plt.hist(returns[:, 0], label="Empirical", alpha=0.5)
    plt.hist(returns[:, 1], label="Optimal", alpha=0.5)
    plt.title("Histogram of Optimal vs Empirical returns")
    plt.legend()
    plt.savefig(plot_dir / "returns.png")
    plt.close()

    regret = returns[:, 1] - returns[:, 0]
    plt.hist(regret)
    plt.title("Histogram of regret")
    plt.savefig(plot_dir / "regret.png")
    plt.close()

    logging.info(f"Average regret = {np.mean(regret)}, min={np.min(regret)}, max={np.max(regret)}")

    pickle.dump(planner_bad.get(), (outdir / "planner_mistakes.pkl").open("wb"))
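# Sketch of inspecting the planner_mistakes.pkl file that compare() writes, not part of the
# original pipeline. Assumption: the pickle holds the (states, rewards, trajs) tuple returned
# by BadPlannerCollection.get(), which is all None when the planner never underperformed.
def _example_load_planner_mistakes(outdir: Path) -> None:
    with (outdir / "planner_mistakes.pkl").open("rb") as f:
        states, rewards, trajs = pickle.load(f)

    if states is None:
        logging.info("No planner mistakes were recorded")
    else:
        # Per check_shapes(): states (n, 2, 4), rewards (n, 4), trajs (n, 50, 2).
        logging.info(f"Recorded {len(states)} planner mistakes")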
def main(
    mistakes_path: Path,
    outdir: Path,
    plan_iters: int = 10,
    optim: Literal["sgd", "adam"] = "sgd",
    lr: float = 0.1,
    momentum: bool = False,
    nesterov: bool = False,
    extra_inits: bool = False,
    replications: Optional[str] = None,
    log_time: bool = False,
    log_best_inits: bool = False,
    n_traj_max: Optional[int] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    outdir = Path(outdir)
    experiment_dir = outdir / make_experiment(optim, lr, plan_iters, momentum, nesterov, extra_inits)
    experiment_dir.mkdir(parents=True, exist_ok=True)

    setup_logging(verbosity=verbosity, log_path=experiment_dir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)
        mistakes_paths = [
            Path(mistakes_path) / str(index) / "planner_mistakes.pkl"
            for index in replication_indices
        ]
    else:
        mistakes_paths = [Path(mistakes_path)]

    if optim == "sgd":
        optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov)
    elif optim == "adam":
        optimizer = Adam(learning_rate=lr)

    env = LegacyEnv(reward=np.zeros(4))

    starts, rewards, better_trajs = collect_mistakes(mistakes_paths=mistakes_paths, n_max=n_traj_max)

    # Optional fixed control sequences used as extra initializations for the planner.
    init_controls = (
        np.array(
            [
                [[0.0, 1.0]] * 50,
                [[0.0, -1.0]] * 50,
                [[-0.5, -1.0]] * 50,
                [[0.5, -1.0]] * 50,
                [[0.5, 1.0]] * 50,
                [[-0.5, 1.0]] * 50,
            ]
        )
        if extra_inits
        else None
    )

    logging.info("Making trajectories")
    opt_trajs, losses = make_opt_trajs(
        traj_opt=TrajOptimizer(
            n_planner_iters=plan_iters,
            optim=optimizer,
            init_controls=init_controls,
            log_best_init=log_best_inits,
        ),
        rewards=rewards,
        starts=starts,
        log_time=log_time,
    )

    logging.info("Rolling out trajectories")
    returns = np.empty((len(starts), 2))
    for i, (start, reward_weights, opt_traj, policy_traj, loss) in enumerate(
        zip(starts, rewards, opt_trajs, better_trajs, losses)
    ):
        env.reward = reward_weights
        traj_opt_return = rollout(actions=opt_traj, env=env, start=start)
        policy_return = rollout(actions=policy_traj, env=env, start=start)
        assert (
            abs(traj_opt_return + loss) < 0.001
        ), f"Rollout={traj_opt_return} and loss={loss} differ by too much. start={start}, reward={reward_weights}"

        returns[i, 0] = traj_opt_return
        returns[i, 1] = policy_return
        logging.debug(
            f"Traj opt return={traj_opt_return}, loss={loss}, policy_return={policy_return}, delta={traj_opt_return - policy_return}"
        )

    np.save(experiment_dir / "returns.npy", returns)

    deltas = returns[:, 0] - returns[:, 1]
    logging.info(
        f"Mean delta={np.mean(deltas)}, mean better={np.mean(deltas > 0) * 100:.1f}%, optim={optim}, "
        f"lr={lr}, n={plan_iters}, momentum={momentum}, nesterov={nesterov}, extra inits={extra_inits}"
    )

    plot_returns(returns, experiment_dir)
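# Direct-call sketch for main(), not part of the original pipeline. Assumption: mistakes_path
# points at a planner_mistakes.pkl produced by compare() above; the remaining arguments keep
# the values suggested by their defaults.
def _example_main(mistakes_path: Path, outdir: Path) -> None:
    main(
        mistakes_path=mistakes_path,
        outdir=outdir,
        plan_iters=10,
        optim="sgd",
        lr=0.1,
        extra_inits=True,
        verbosity="INFO",
    )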