def convert_legacy_td3(filename: Union[str, Path], replications: Optional[str] = None) -> None:
    # We used to include reward features as part of state, but now they're extra dimensions. This
    # function converts from the old encoding to the new one.
    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(filename), replication_indices)
        for td3_path in td3_paths:
            convert_legacy_td3(td3_path)
        exit()

    # We can't do this if we're adding more dimensions. We just have car state + reward features
    explicit_args = pickle.load(open(str(filename) + ".meta.pkl", "rb"), fix_imports=True)
    assert "extra_state_dim" not in explicit_args.keys()

    env = gym.make("LegacyDriver-v1", reward=np.zeros(4))
    state_dim = np.prod(env.observation_space.shape)
    action_dim = np.prod(env.action_space.shape)
    explicit_args["extra_state_dim"] = 4

    max_action = max(np.max(env.action_space.high), -np.min(env.action_space.low))

    td3 = Td3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        **explicit_args,
    )
    td3.load(str(filename))

    pickle.dump(explicit_args, open(str(filename) + ".meta.pkl", "wb"))
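# Example invocations (a sketch; the checkpoint prefix and the replication range string are
# hypothetical, and the exact syntax accepted by `parse_replications` is an assumption):
#
#     convert_legacy_td3("data/td3/policy")          # convert a single legacy checkpoint
#     convert_legacy_td3("data/td3/policy", "1-5")   # convert one checkpoint per replication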
def refine(
    reward_path: Path,
    td3_dir: Path,
    env_iters: int = int(1e5),
    batch_size: int = 100,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    logging.basicConfig(level=verbosity)

    if replications is not None:
        replication_indices = parse_replications(replications)
        reward_dir, reward_name = make_reward_path(reward_path)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        Parallel(n_jobs=-2)(
            delayed(refine)(
                reward_path=reward_dir / str(i) / reward_name,
                td3_dir=td3_path,
                env_iters=env_iters,
                batch_size=batch_size,
            )
            for i, td3_path in zip(replication_indices, td3_paths)
        )
        exit()

    td3_dir = Path(td3_dir)
    writer = SummaryWriter(log_dir=td3_dir.parent, filename_suffix="refine")

    reward_weights = np.load(reward_path)
    env = gym.make("LegacyDriver-v1", reward=reward_weights)

    td3 = load_td3(env, td3_dir, writer=writer)
    buffer = ReplayBuffer(td3.state_dim, td3.action_dim, writer=writer)
    logging.info("Initialized TD3 algorithm")

    raw_state = env.reset()
    state = make_TD3_state(raw_state, reward_features=env.features(raw_state))
    for t in range(env_iters):
        action = td3.select_action(state)
        next_raw_state, reward, done, info = env.step(action)
        log_step(next_raw_state, action, reward, info, log_iter=t, writer=writer)

        reward_features = info["reward_features"]

        if done:
            assert t % env.HORIZON == env.HORIZON - 1, f"Done at t={t} when horizon={env.HORIZON}"
            next_raw_state = env.reset()
            next_state = make_TD3_state(
                next_raw_state,
                reward_features=env.main_car.features(next_raw_state, None).numpy(),
            )
        else:
            next_state = make_TD3_state(next_raw_state, reward_features)

        # Store data in replay buffer
        buffer.add(state, action, next_state, reward, done=float(done))

        state = next_state

        td3.update_critic(*buffer.sample(batch_size))

    td3.save(str(td3_dir) + "_refined")
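# `make_TD3_state` is defined elsewhere in this project; the sketch below is only an assumption
# about its behavior, included for readability. The legacy conversion above adds 4 extra state
# dimensions, which suggests the TD3 input is the flattened car state concatenated with the
# reward features (and, in `train` below, optionally the reward weights). The name
# `_make_td3_state_sketch` is hypothetical.
def _make_td3_state_sketch(
    raw_state: np.ndarray,
    reward_features: np.ndarray,
    reward_weights: Optional[np.ndarray] = None,
) -> np.ndarray:
    # Flatten every component and concatenate into a single 1-D TD3 input vector.
    parts = [np.asarray(raw_state).flatten(), np.asarray(reward_features).flatten()]
    if reward_weights is not None:
        parts.append(np.asarray(reward_weights).flatten())
    return np.concatenate(parts)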
def premake_test_rewards(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    true_reward_name: Path = Path("true_reward.npy"),
    datadir: Path = Path(),
    outdir: Path = Path(),
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    """Finds test rewards for each experiment."""
    outdir.mkdir(parents=True, exist_ok=True)
    # TODO(joschnei): I'm making some dangerous logging decisions. Do I want to append to logs, or
    # give logs unique names? I really need to pick at least one.
    setup_logging(verbosity, log_path=outdir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(f"Replication {replication} does not exist, skipping")
                continue

            premake_test_rewards(
                epsilons=epsilons,
                n_rewards=n_rewards,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                true_reward_name=true_reward_name,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_equiv=use_equiv,
                n_cpus=n_cpus,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            logging.info(f"Done with replication {replication}")
        exit()

    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, 4)

    with Parallel(n_jobs=n_cpus) as parallel:
        make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite,
        )
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """Evaluates alignment tests generated from ground-truth rewards."""
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(f"Replication {replication} does not exist, skipping")
                continue

            logging.info(f"Starting replication {replication}")

            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        exit()

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # Argh, defaults to parsing something as a string if it's optional
        n_random_test_questions = int(n_random_test_questions)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")
    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(
        f"""Filtering settings:
        # reward samples={n_reward_samples},
        use mean reward={use_mean_reward},
        skip duplicates={skip_remove_duplicates}
        skip noise={True}
        skip epsilon={skip_epsilon_filtering}
        skip redundancy={skip_redundancy_filtering}
        use true epsilon={use_true_epsilon}
        """
    )

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path, overwrite_results, default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite_results, default={})

    experiments = make_experiments(
        epsilons, deltas, human_samples, overwrite_results, experiments=set(minimal_tests.keys())
    )

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=flags["equiv_size"],
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0

        logging.info(
            f"{np.mean(good_indices) * 100:.2f}% of new test questions agree with gt reward."
        )

        if use_cheating_questions:
            logging.info("Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(
            elicited_normals[:max_n], preferences, use_equiv, n_reward_features
        )
        logging.debug(f"normals={normals[:10]}")

        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward, epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
        delayed(run_gt_experiment)(
            normals=normals,
            test_rewards=test_rewards[epsilon][0],
            test_reward_alignment=test_rewards[epsilon][1],
            epsilon=epsilon,
            delta=delta,
            use_equiv=use_equiv,
            n_human_samples=n,
            factory=factory,
            input_features=input_features,
            preferences=preferences,
            outdir=outdir,
            verbosity=verbosity,
        )
        for epsilon, delta, n in experiments
    ):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    pkl.dump(confusions, open(confusion_path, "wb"))
    pkl.dump(minimal_tests, open(test_path, "wb"))
def simulated(
    outdir: Path,
    criterion: Literal["information", "volume", "random"],
    termination_threshold: float,
    n_reward_samples: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: Optional[float] = None,
    true_reward_path: Optional[Path] = None,
    continuous: bool = False,
    overwrite: bool = False,
    replications: Optional[str] = None,
):
    """Generates a test by eliciting from a human simulated by a ground-truth reward."""
    if replications is not None:
        replication_indices = parse_replications(replications)

        if true_reward_path is not None:
            reward_dir, reward_name = make_reward_path(true_reward_path)
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    true_reward_path=reward_dir / str(i) / reward_name,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        else:
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        exit()

    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    env = Driver()
    d = env.num_of_features

    if true_reward_path is not None:
        logging.info(f"Loading true reward from {true_reward_path}")
        true_reward = np.load(true_reward_path)
    else:
        logging.info("Randomly generating true reward")
        true_reward = np.random.normal(size=(4,))
        true_reward = true_reward / np.linalg.norm(true_reward)
        np.save(outdir / "true_reward.npy", true_reward)

    pickle.dump(
        {
            "criterion": criterion,
            "reward_iterations": n_reward_samples,
            "stop_thresh": termination_threshold,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    # If there is already data, feed it to the w_sampler to get the right posterior.
    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
    try:
        while score >= termination_threshold:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                sample_count=n_reward_samples, query_type=query_type, delta=equiv_size
            )

            input_A, input_B, score = run_algo(criterion, env, w_samples, delta_samples, continuous)

            logging.info(f"Score={score}")

            if score > termination_threshold:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_simulated_feedback(
                    simulation=env,
                    input_A=input_A,
                    input_B=input_B,
                    query_type=query_type,
                    true_reward=true_reward,
                    delta=equiv_size,
                )
                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
                preferences = append(preferences, preference)

                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not exit again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
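# The `append` helper used in the elicitation loop above is defined elsewhere in the project; a
# minimal sketch of the assumed behavior (grow an optionally-empty array by one leading row) is
# included here for clarity. The name `_append_sketch` is hypothetical.
def _append_sketch(arr: Optional[np.ndarray], item: np.ndarray) -> np.ndarray:
    # First observation: create a new array with a leading batch dimension.
    if arr is None:
        return np.expand_dims(item, axis=0)
    # Otherwise stack the new item under the existing rows.
    return np.concatenate([arr, np.expand_dims(item, axis=0)], axis=0)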
def main(
    n_questions: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: float = 1.1,
    reward_iterations: int = 100,
    outdir: Path = Path("data/simulated/random/elicitation"),
    human: bool = False,
    reward_path: Optional[Path] = None,
    replications: Optional[str] = None,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    outpath = Path(outdir)
    outpath.mkdir(parents=True, exist_ok=True)
    setup_logging(verbosity=verbosity, log_path=outpath / "log.txt")

    if not human:
        assert reward_path is not None
        reward_dir, reward_name = make_reward_path(reward_path)
        reward_path = reward_dir / reward_name

    if replications is not None:
        replication_indices = parse_replications(replications)
        n_cpus = min(multiprocessing.cpu_count() - 4, len(replication_indices))

        Parallel(n_jobs=n_cpus)(
            delayed(main)(
                n_questions=n_questions,
                query_type=query_type,
                equiv_size=equiv_size,
                reward_iterations=reward_iterations,
                outdir=outpath / str(i),
                human=human,
                reward_path=reward_dir / str(i) / reward_name,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            for i in replication_indices
        )
        exit()

    if not human:
        assert reward_path is not None
        if not reward_path.exists():
            logging.warning("Reward path given does not exist, generating random reward.")
            true_reward = np.random.default_rng().normal(loc=0, scale=1, size=(4,))
            true_reward = safe_normalize(true_reward)
            np.save(reward_path, true_reward)
        else:
            true_reward = np.load(reward_path)

    pickle.dump(
        {
            "n_questions": n_questions,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "reward_iterations": reward_iterations,
            "human": human,
        },
        open(outpath / "flags.pkl", "wb"),
    )

    normals = load(outpath / "normals.npy", overwrite=overwrite)
    preferences = load(outpath / "preferences.npy", overwrite=overwrite)
    # TODO(joschnei): Make class for inputs, dimensions are too difficult to reason about
    # (N, 2, 100)
    inputs = load(outpath / "inputs.npy", overwrite=overwrite)
    input_features = load(outpath / "input_features.npy", overwrite=overwrite)

    env = Driver()

    if (
        inputs is not None
        and input_features is not None
        and inputs.shape[0] > input_features.shape[0]
    ):
        logging.info("Catching up to previously generated trajectories.")
        input_A, input_B = inputs[-1]
        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )
        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    # Questions and inputs are duplicated, but this keeps everything consistent for the hot-load case
    new_questions = n_questions - inputs.shape[0] if inputs is not None else n_questions
    questions = make_random_questions(n_questions=new_questions, env=env)
    logging.debug(f"questions={questions[:10]}")

    if inputs is not None:
        assert input_features is not None
        assert normals is not None
        assert preferences is not None
        assert inputs.shape[0] == input_features.shape[0]
        assert inputs.shape[0] == normals.shape[0]
        assert inputs.shape[0] == preferences.shape[0]

    for input_A, input_B in questions:
        inputs = update_inputs(input_A, input_B, inputs, outpath)
        if inputs.shape[0] % 10 == 0:
            logging.info(f"{inputs.shape[0]} of {n_questions}")

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )

        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    save_reward(
        query_type=query_type,
        true_delta=equiv_size,
        w_sampler=Sampler(env.num_of_features),
        n_reward_samples=reward_iterations,
        outdir=outpath,
    )
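# Example invocations of the elicitation entry point above (a sketch; the paths are hypothetical
# and how this function is wired into a CLI is an assumption):
#
#     main(n_questions=100, reward_path=Path("data/reward/true_reward.npy"))  # simulated feedback
#     main(n_questions=100, human=True)                                       # human-in-the-loop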
def compare(
    reward_path: Path,
    td3_dir: Path,
    outdir: Path,
    planner_iters: int = 10,
    random_start: bool = False,
    n_starts: int = 1,
    replications: Optional[str] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")
    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        for replication, td3_path in zip(replication_indices, td3_paths):
            compare(
                reward_path=Path(reward_path) / str(replication) / "true_reward.npy",
                outdir=Path(outdir) / str(replication),
                td3_dir=td3_path,
                planner_iters=planner_iters,
                random_start=random_start,
                n_starts=n_starts,
                verbosity=verbosity,
            )
        exit()

    reward_weights: np.ndarray = np.load(reward_path).astype(np.float32)
    env = gym.make("LegacyDriver-v1", reward=reward_weights, random_start=random_start)
    td3 = load_td3(env, td3_dir)

    traj_optimizer = TrajOptimizer(planner_iters)

    class BadPlannerCollection:
        def __init__(self):
            self.states = None
            self.rewards = None
            self.trajs = None

        def append(
            self, state: np.ndarray, reward: np.ndarray, traj: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
            if self.states is None:
                self.states = np.array([state])
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.array([reward])
                self.trajs = np.array([traj])
            else:
                self.states = np.append(self.states, [state], axis=0)
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.append(self.rewards, [reward], axis=0)
                self.trajs = np.append(self.trajs, [traj], axis=0)

            self.check_shapes()

            return self.get()

        def check_shapes(self):
            assert len(self.states.shape) == 3
            assert len(self.rewards.shape) == 2
            assert len(self.trajs.shape) == 3

            assert self.states.shape[1:] == (2, 4)
            assert self.rewards.shape[1] == 4
            assert self.trajs.shape[1:] == (50, 2)

        def get(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
            return self.states, self.rewards, self.trajs

    planner_bad = BadPlannerCollection()

    returns = np.empty((n_starts, 2))
    for i in range(n_starts):
        logging.info(f"{i+1}/{n_starts}")
        start_state: np.ndarray = env.reset()

        logging.info("Optimizing traj")
        opt_traj = traj_optimizer.make_opt_traj(reward_weights, start_state)

        logging.info("Executing traj")
        opt_return = 0.0
        for action in opt_traj:
            state, reward, done, info = env.step(action)
            opt_return += reward

        opt_return = opt_return / len(opt_traj)

        logging.info("Evaluating policy")
        empirical_return, traj = eval(
            reward_weights=reward_weights,
            td3=td3,
            start_state=start_state,
            time_in_state=False,
            return_actions=True,
        )

        returns[i] = empirical_return, opt_return

        if opt_return < empirical_return:
            planner_bad.append(start_state, reward_weights, traj)

    outdir.mkdir(parents=True, exist_ok=True)
    plot_dir = outdir / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)

    plt.hist(returns[:, 0], label="Empirical", alpha=0.5)
    plt.hist(returns[:, 1], label="Optimal", alpha=0.5)
    plt.title("Histogram of Optimal vs Empirical returns")
    plt.legend()
    plt.savefig(plot_dir / "returns.png")
    plt.close()

    regret = returns[:, 1] - returns[:, 0]
    plt.hist(regret)
    plt.title("Histogram of regret")
    plt.savefig(plot_dir / "regret.png")
    plt.close()

    logging.info(f"Average regret = {np.mean(regret)}, min={np.min(regret)}, max={np.max(regret)}")

    pickle.dump(planner_bad.get(), (outdir / "planner_mistakes.pkl").open("wb"))
def train(
    outdir: Path,
    reward_path: Path = Path(),
    actor_layers: List[int] = [256, 256],
    critic_layers: List[int] = [256, 256],
    dense: bool = False,
    use_reward_features: bool = True,
    n_timesteps: int = int(1e6),
    n_random_timesteps: int = int(25e3),
    exploration_noise: float = 0.1,
    batch_size: int = 256,
    save_period: int = int(5e3),
    model_name: str = "policy",
    random_rewards: bool = False,
    timestamp: bool = False,
    random_start: bool = False,
    replications: Optional[str] = None,
    plot_episodes: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    logging.basicConfig(level=verbosity)

    if replications is not None:
        replication_indices = parse_replications(replications)
        if not random_rewards:
            reward_dir, reward_name = make_reward_path(reward_path)
            reward_paths = [reward_dir / str(i) / reward_name for i in replication_indices]
        else:
            reward_paths = [None for _ in replication_indices]

        Parallel(n_jobs=-2)(
            delayed(train)(
                reward_path=reward_path,
                outdir=Path(outdir) / str(i),
                actor_layers=actor_layers,
                critic_layers=critic_layers,
                dense=dense,
                n_timesteps=n_timesteps,
                n_random_timesteps=n_random_timesteps,
                exploration_noise=exploration_noise,
                batch_size=batch_size,
                save_period=save_period,
                model_name=model_name,
                timestamp=timestamp,
                random_start=random_start,
                random_rewards=random_rewards,
            )
            for i, reward_path in zip(replication_indices, reward_paths)
        )
        exit()

    outdir = make_outdir(outdir, timestamp)

    writer = SummaryWriter(log_dir=outdir)
    logging.basicConfig(filename=outdir / "log", level=verbosity)

    if not random_rewards:
        reward_weights = np.load(reward_path)
        env: LegacyEnv = gym.make(
            "LegacyDriver-v1", reward=reward_weights, random_start=random_start, time_in_state=True
        )
    else:
        env = gym.make("RandomLegacyDriver-v1", random_start=random_start, time_in_state=True)

    action_shape = env.action_space.sample().shape

    logging.info("Initialized env")

    if (outdir / (model_name + "_actor")).exists():
        td3 = load_td3(env=env, filename=outdir / model_name, writer=writer)
    else:
        td3 = make_td3(
            env,
            actor_kwargs={"layers": actor_layers, "dense": dense},
            critic_kwargs={"layers": critic_layers, "dense": dense},
            writer=writer,
            extra_state_dim=(random_rewards + use_reward_features) * len(env.reward),
        )

    buffer = ReplayBuffer(td3.state_dim + td3.extra_state_dim, td3.action_dim, writer=writer)
    logging.info("Initialized TD3 algorithm")

    raw_state = env.reset()
    logging.debug(f"raw_state={raw_state}")
    state = make_TD3_state(
        raw_state,
        reward_features=env.features(raw_state),
        reward_weights=env.reward_weights if random_rewards else None,
    )

    episode_reward_features = np.empty((env.HORIZON, *env.reward.shape))
    episode_actions = np.empty((env.HORIZON, *env.action_space.shape))

    best_return = float("-inf")
    for t in range(n_timesteps):
        action = pick_action(t, n_random_timesteps, env, td3, state, exploration_noise)
        assert action.shape == action_shape, f"Action shape={action.shape}, expected={action_shape}"

        next_raw_state, reward, done, info = env.step(action)
        log_step(next_raw_state, action, reward, info, log_iter=t, writer=writer)

        # Log episode features
        reward_features = info["reward_features"]
        if save_period - (t % save_period) <= env.HORIZON:
            episode_reward_features[t % env.HORIZON] = reward_features
            episode_actions[t % env.HORIZON] = action

        if done:
            assert t % env.HORIZON == env.HORIZON - 1, f"Done at t={t} when horizon={env.HORIZON}"
            next_raw_state = env.reset()
            next_state = make_TD3_state(
                next_raw_state,
                reward_features=env.features(next_raw_state),
                reward_weights=env.reward_weights if random_rewards else None,
            )
        else:
            next_state = make_TD3_state(
                next_raw_state,
                reward_features,
                reward_weights=info["reward_weights"] if random_rewards else None,
            )

        # Store data in replay buffer
        buffer.add(state, action, next_state, reward, done=float(done))

        state = next_state

        # Train agent after collecting sufficient data
        if t >= n_random_timesteps:
            td3.train(buffer, batch_size)

        if t % save_period == 0:
            logging.info(f"{t} / {n_timesteps}")
            if plot_episodes and t != 0:
                plot_heading(
                    heading=episode_reward_features[:, 2],
                    outdir=outdir / "plots" / "heading",
                    name=str(t // save_period),
                )
                plot_turn(
                    turn=episode_actions[:, 0],
                    outdir=outdir / "plots" / "turn",
                    name=str(t // save_period),
                )
            td3.save(str(outdir / model_name))

            logging.info("Evaluating the policy")
            # TODO(joschnei): If random_rewards, generate either a fixed or random-per-eval bag of
            # eval rewards and save the policy if it has better mean return over the bag of
            # eval-rewards. Otherwise just use the fixed reward.
            if random_rewards:
                raise NotImplementedError("Random rewards haven't been fully implemented yet.")
            eval_return = eval(
                reward_weights,
                td3=td3,
                writer=writer,
                log_iter=t // save_period,
            )
            if eval_return > best_return:
                best_return = eval_return
                td3.save(str(outdir / (f"best_{model_name}")))
def main(
    mistakes_path: Path,
    outdir: Path,
    plan_iters: int = 10,
    optim: Literal["sgd", "adam"] = "sgd",
    lr: float = 0.1,
    momentum: bool = False,
    nesterov: bool = False,
    extra_inits: bool = False,
    replications: Optional[str] = None,
    log_time: bool = False,
    log_best_inits: bool = False,
    n_traj_max: Optional[int] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
    outdir = Path(outdir)
    experiment_dir = outdir / make_experiment(
        optim, lr, plan_iters, momentum, nesterov, extra_inits
    )
    experiment_dir.mkdir(parents=True, exist_ok=True)

    setup_logging(verbosity=verbosity, log_path=experiment_dir / "log.txt")

    if replications is not None:
        replication_indices = parse_replications(replications)
        mistakes_paths = [
            Path(mistakes_path) / str(index) / "planner_mistakes.pkl"
            for index in replication_indices
        ]
    else:
        mistakes_paths = [Path(mistakes_path)]

    if optim == "sgd":
        optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov)
    elif optim == "adam":
        optimizer = Adam(learning_rate=lr)

    env = LegacyEnv(reward=np.zeros(4))

    starts, rewards, better_trajs = collect_mistakes(
        mistakes_paths=mistakes_paths, n_max=n_traj_max
    )

    init_controls = (
        np.array(
            [
                [[0.0, 1.0]] * 50,
                [[0.0, -1.0]] * 50,
                [[-0.5, -1.0]] * 50,
                [[0.5, -1.0]] * 50,
                [[0.5, 1.0]] * 50,
                [[-0.5, 1.0]] * 50,
            ]
        )
        if extra_inits
        else None
    )

    logging.info("Making trajectories")
    opt_trajs, losses = make_opt_trajs(
        traj_opt=TrajOptimizer(
            n_planner_iters=plan_iters,
            optim=optimizer,
            init_controls=init_controls,
            log_best_init=log_best_inits,
        ),
        rewards=rewards,
        starts=starts,
        log_time=log_time,
    )

    logging.info("Rolling out trajectories")
    returns = np.empty((len(starts), 2))
    for i, (start, reward_weights, opt_traj, policy_traj, loss) in enumerate(
        zip(starts, rewards, opt_trajs, better_trajs, losses)
    ):
        env.reward = reward_weights

        traj_opt_return = rollout(actions=opt_traj, env=env, start=start)
        policy_return = rollout(actions=policy_traj, env=env, start=start)

        assert (
            abs(traj_opt_return + loss) < 0.001
        ), f"Rollout={traj_opt_return} and loss={loss} differ by too much. start={start}, reward={reward_weights}"

        returns[i, 0] = traj_opt_return
        returns[i, 1] = policy_return

        logging.debug(
            f"Traj opt return={traj_opt_return}, loss={loss}, policy_return={policy_return}, delta={traj_opt_return - policy_return}"
        )

    np.save(experiment_dir / "returns.npy", returns)

    deltas = returns[:, 0] - returns[:, 1]
    logging.info(
        f"Mean delta={np.mean(deltas)}, mean better={np.mean(deltas > 0) * 100:.1f}%, optim={optim}, lr={lr}, n={plan_iters}, momentum={momentum}, nesterov={nesterov}, extra inits={extra_inits}"
    )

    plot_returns(returns, experiment_dir)
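# Example invocation (a sketch; the paths and replication string are hypothetical): `compare`
# above writes `planner_mistakes.pkl` into each replication's output directory, and this `main`
# re-plans those failure cases with a different optimizer configuration.
#
#     main(
#         mistakes_path=Path("data/compare"),
#         outdir=Path("data/planner_mistakes"),
#         optim="adam",
#         lr=0.05,
#         plan_iters=100,
#         replications="0-9",
#     )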