def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
    """Initialize.

    :param env: gym environment. Assuming the observation space is a tuple,
        where the first component is from the original env, and the second
        is the temporal goal state.
    :param params: dict of parameters, like `default_parameters`.
    :param model_path: directory where to save models.
    :param log_path: directory where to save tensorboard logs.
    """
    # Checks
    if params["initialize_file"]:
        raise ValueError("Initialization not supported; use resuming option")
    if params["action_bias"]:
        raise ValueError("Action bias is not maintained here")

    # Alias
    original_env = env

    # Load a saved agent for the action bias
    #   (NOTE: unreachable while the guard above rejects `action_bias`)
    self.biased_agent: Optional[DQN] = None
    if params["action_bias"]:
        loading_params = dict(params)
        loading_params["resume_file"] = params["action_bias"]
        loading_params["action_bias"] = None
        self.biased_agent = TrainStableBaselines(
            env=env,
            params=loading_params,
            model_path=model_path,
            log_path=log_path,
        ).model

    # Collect statistics
    #   (assuming future wrappers do not modify episodes)
    env = MyStatsRecorder(env=env, gamma=params["gamma"])

    # Callbacks
    checkpoint_callback = CustomCheckpointCallback(
        save_path=model_path,
        save_freq=params["save_freq"],
        extra=None,
    )
    stats_logger_callback = StatsLoggerCallback(stats_recorder=env, scope="env0")
    callbacks_list = [checkpoint_callback, stats_logger_callback]
    if params["render"]:
        renderer_callback = RendererCallback()
        callbacks_list.append(renderer_callback)

    # If training a passive agent, log this too
    if params["active_passive_agents"]:
        # Find the reward shaping env
        reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

        passive_stats_env = MyStatsRecorder(
            env=UnshapedEnv(reward_shaping_env),
            gamma=params["gamma"],
        )
        passive_stats_callback = StatsLoggerCallback(
            stats_recorder=passive_stats_env,
            scope="env1",
        )
        callbacks_list.append(passive_stats_callback)

        # Make it move with the original env
        env = UnshapedEnvWrapper(
            shaped_env=env,
            unshaped_env=passive_stats_env,
        )
        original_reward_getter = env.get_reward  # alias
    else:
        original_reward_getter = None

    # Combine callbacks
    all_callbacks = CallbackList(callbacks_list)

    # Define or load
    resuming = bool(params["resume_file"])
    if not resuming:
        # Normalizer
        normalized_env = NormalizeEnvWrapper(
            env=env,
            training=True,
            entry=0,  # Only env features, not temporal goal state
        )
        flat_env = BoxAutomataStates(normalized_env)
        # Saving normalizer too
        checkpoint_callback.saver.extra_model = normalized_env

        # Agent
        model = DQN(
            env=flat_env,
            policy=ModularPolicy,
            policy_kwargs={
                "layer_norm": params["layer_norm"],
                "layers": params["layers"],
                "shared_layers": params["shared_layers"],
                "dueling": params["dueling"],
            },
            gamma=params["gamma"],
            learning_rate=params["learning_rate"],
            train_freq=params["train_freq"],
            double_q=True,
            batch_size=params["batch_size"],
            buffer_size=params["buffer_size"],
            learning_starts=params["learning_starts"],
            prioritized_replay=True,
            target_network_update_freq=params["target_network_update_freq"],
            exploration_fraction=params["exploration_fraction"],
            exploration_final_eps=params["exploration_final_eps"],
            exploration_initial_eps=params["exploration_initial_eps"],
            active_passive_agents=params["active_passive_agents"],
            passive_reward_getter=original_reward_getter,
            tensorboard_log=log_path,
            full_tensorboard_log=False,
            verbose=1,
        )
    else:
        # Reload model
        model, extra_model, counters = checkpoint_callback.load(
            path=params["resume_file"],
        )
        # Restore normalizer and env
        normalized_env = extra_model
        normalized_env.set_env(env)
        flat_env = BoxAutomataStates(normalized_env)
        # Restore properties
        model.tensorboard_log = log_path
        model.num_timesteps = counters["step"]
        model.learning_starts = params["learning_starts"] + counters["step"]
        model.set_env(flat_env)
        model.passive_reward_getter = original_reward_getter

    # Store
    self.params = params
    self.resuming = resuming
    self.saver = checkpoint_callback
    self.logger = stats_logger_callback
    self.callbacks = all_callbacks
    self.model: DQN = model
    self.normalized_env = normalized_env
    self.testing_agent = (
        model if not params["test_passive"] else model.passive_agent
    )
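# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original file).
# A minimal way this constructor might be driven, assuming that
# `default_parameters` is the defaults dict mentioned in the docstring and
# that `make_tuple_env` is a hypothetical factory returning an env whose
# observations are (env_features, temporal_goal_state) tuples.
def example_training_run():
    """Sketch: build the trainer and run a short learning session."""
    params = dict(default_parameters)
    params["initialize_file"] = None   # required by the guard in __init__
    params["action_bias"] = None       # required by the guard in __init__
    params["resume_file"] = None       # train from scratch
    params["render"] = False

    trainer = TrainStableBaselines(
        env=make_tuple_env(),          # hypothetical env factory
        params=params,
        model_path="models/exp0",
        log_path="logs/exp0",
    )
    trainer.model.learn(
        total_timesteps=100_000,
        callback=trainer.callbacks,
    )
# ---------------------------------------------------------------------------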
# NOTE: this excerpt starts mid-way through the original script. The imports
# below and the leading arguments of the DQN constructor (policy and env) are
# assumed for completeness; `env`, `callback`, `time_steps` and `CASE_NAME`
# are assumed to be defined earlier in the script.
import os

from stable_baselines import DQN

model = DQN(
    "MlpPolicy",  # assumed: the original excerpt begins at `n_cpu_tf_sess`
    env,          # assumed
    n_cpu_tf_sess=256,
    buffer_size=20000,
    gamma=0.95,
    batch_size=512,
)

load_steps = 0
if load_steps > 0:
    # Resume from an intermediate checkpoint instead of the fresh model
    tmp_path = os.path.join('./tmp/%s' % CASE_NAME, "%d" % load_steps)
    del model
    model = DQN.load(
        tmp_path,
        learning_rate=0.00025,
        env=env,
        verbose=1,
        tensorboard_log="./dqn_%s_tensorboard/" % CASE_NAME,
    )
    model.num_timesteps = load_steps

model.learn(total_timesteps=int(time_steps), callback=callback)
model.save("dqn_%s" % CASE_NAME)

del model  # remove to demonstrate saving and loading
model = DQN.load("dqn_%s" % CASE_NAME)

# Demo rollout; assumes `env` is a VecEnv, which auto-resets when done
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
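# ---------------------------------------------------------------------------
# Evaluation sketch (illustrative addition). The demo loop above relies on a
# VecEnv auto-resetting after `done`; for a plain, non-vectorized gym env the
# reset must be explicit. A minimal greedy evaluation over a fixed number of
# episodes might look like this:
def evaluate(model, env, n_episodes=10):
    """Run greedy episodes and return the mean undiscounted return."""
    returns = []
    for _ in range(n_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, _info = env.step(action)
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)
# ---------------------------------------------------------------------------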