def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object, so a lambda is used here.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
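# The config above assumes a custom vision model was registered elsewhere in the
# project under the name "image-ppo". A minimal sketch of that registration
# (ImagePPOModel is a placeholder for the project's actual TorchModelV2 subclass,
# not something defined here):
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("image-ppo", ImagePPOModel)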
class PPOrl(object):

    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = PPOTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None, n_iter=2000, save_checkpoint=10):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(n_iter):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % save_checkpoint == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)

        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
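# A minimal usage sketch for the PPOrl wrapper above (not part of the original
# code): CartPoleConfigEnv is an illustrative RLlib-style env whose constructor
# takes a single env_config dict; the checkpoint path and iteration counts are
# placeholders.
import gym
from ray.rllib.agents.ppo import DEFAULT_CONFIG


class CartPoleConfigEnv(gym.Wrapper):
    # RLlib-style constructor: a single env_config dict.
    def __init__(self, env_config):
        super().__init__(gym.make("CartPole-v0"))


ppo_config = DEFAULT_CONFIG.copy()
ppo_config["num_workers"] = 1
model = PPOrl(env=CartPoleConfigEnv, env_config={}, config=ppo_config)
model.fit(checkpoint="/tmp/checkpoint_rl.pkl", n_iter=5, save_checkpoint=5)
print(model.predict(checkpoint="/tmp/checkpoint_rl.pkl")["reward"])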
def single_test(defaultconfig, training_trials, evaluation_trials, check,
                lr=0.00005, num_workers=4, num_gpus=0.25):
    ray.shutdown()
    ray.init(**ray_init_kwargs)
    config = ppo.DEFAULT_CONFIG.copy()
    if num_gpus > 0:
        config["num_gpus"] = num_gpus
    config["num_workers"] = num_workers
    config["lr"] = lr
    config["train_batch_size"] = 8000
    config["num_sgd_iter"] = 5
    config["env_config"] = defaultconfig
    trainer = Trainer(config=config, env=qsdl.QSDEnv)

    for i in range(training_trials):
        result = trainer.train()
        print("train iteration", i + 1, "/", training_trials,
              " avg_reward =", result["episode_reward_mean"],
              " timesteps =", result["timesteps_total"])
        if i % check == check - 1:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

    avgR = 0
    for i in range(evaluation_trials):
        env = qsdl.QSDEnv(defaultconfig)
        obs = env.reset()
        done = False
        while not done:
            action = trainer.compute_action(obs)
            obs, r, done, _ = env.step(action)
            avgR += r
    return avgR / evaluation_trials
                                 rnn_config['model'], 'happy')
    state = dummy_model.get_initial_state()
    state_list.append((key, [s.detach().numpy() for s in state]))
state_list = dict(state_list)

iters = 100
tdiff = 0.0
tot = [0.0, 0.0]
for i in range(iters):
    obs = ff.reset()
    r1 = 0.0
    state2 = deepcopy(state_list)
    while True:
        policy_id = list(obs.keys())[0]
        logits = trainer.compute_action(obs[policy_id], state2[policy_id],
                                        policy_id=policy_id, full_fetch=False)
        state2[policy_id] = logits[1]
        turn = int(list(obs.values())[0][6] / 0.025 + 0.0125)
        if turn < 2:
            action = 0
        elif turn == 2:
            action = 6
        elif len(obs) == 13:
            if turn == 3:
                action = 4
            elif turn == 4:
                action = 7
            elif turn == 5:
                action = 8
            else:
doneAll = False
i = 0
drift_data = []
timestep_data = []
while not doneAll:
    i += 1
    done = False
    # env.render()
    observation = env.resetFromFrame(startFrame=0, resetYaw=0, startFromRef=True)
    # drawAxis()
    pause = False
    drift = []
    while not done and not doneAll:
        if not pause:
            action = agent.compute_action(observation)
            observation, reward, f_done, info = env.step(action)
            done = f_done
            drift.append(calcDrift(env.robot_pos, env.starting_robot_pos, env.target))
            # drawLine(
            #     env.robot_pos,
            #     [env.flat_env.robot.walk_target_x, env.flat_env.robot.walk_target_y, 0],
            #     [0, 0, 1],
            # )
        # time.sleep(1.0 / fps)
        # keys = pybullet.getKeyboardEvents()
        # if qKey in keys and keys[qKey] & pybullet.KEY_WAS_TRIGGERED:
        #     print("QUIT")
        #     doneAll = True
drift_data = []
timestep_data = []
while not doneAll:
    i += 1
    done = False
    # env.render()
    observation = env.resetFromFrame(startFrame=0, resetYaw=0, startFromRef=True)
    # drawAxis()
    pause = False
    drift = []
    while not done and not doneAll:
        action = dict()
        if not pause:
            if 'high_level_agent' in observation:
                action['high_level_agent'] = agent.compute_action(
                    observation['high_level_agent'], policy_id='high_level_policy')
                # if(not pause):
                #     pause = True
            else:
                action[env.low_level_agent_id] = agent.compute_action(
                    observation[env.low_level_agent_id], policy_id='low_level_policy')
            observation, reward, f_done, info = env.step(action)
            done = f_done['__all__'] == True
            drift.append(calcDrift(env.robot_pos, env.starting_robot_pos, env.target))
            # if(f_done['__all__']):
            #     print("Done")
            # targetHL = np.array([
            #     np.cos(env.highLevelDegTarget),
            #     np.sin(env.highLevelDegTarget),
            #     0
            # ]) * 5
            # drawLine(env.robot_pos, env.robot_pos + targetHL, [0, 1, 0])
"agent_names": agent_names, "env_id": env_id, "phase": 0 }) for i in range(1): obs = env.reset() done = False step = 0 while not done: env.render() actions = env.act(obs) actions[1] = ppo_agent.compute_action(observation=penv.featurize( obs[1]), policy_id="ppo_policy") actions[3] = ppo_agent.compute_action(observation=penv.featurize( obs[3]), policy_id="ppo_policy") obs, reward, done, info = env.step(actions) features = penv.featurize(obs[1]) for i in range(13): print("i:", i) print(features["board"][:, :, i]) print("======") print(obs[1]["board"]) print() print(obs[1]["bomb_life"]) print("step:", step)
# env = Monitor(env, "gym_monitor_results", write_upon_reset=True, force=True)
env = launch_and_wrap_duckieenv(config["env_config"], seed)
print(env)
# config['env_config'] del 'action_type'
for i in range(1):
    steps = 0
    env = env.env.env.env
    print("After unwrapping, env is {}".format(env))
    obs = env.reset()
    env.render(render_mode)
    print("env's seed is {}".format(env.seed_value))
    print("default robot speed is {}".format(env.robot_speed))
    print("frame rate is {}".format(env.frame_rate))
    done = False
    while not done:
        action = trainer.compute_action(obs, explore=False)
        tuple_action = np.clip(np.array([1 + action[0], 1 - action[0]]), 0., 1.)
        angle_vel_action = convert_to_vel_angle_actions(tuple_action)
        # angle_vel_action[0] = min(angle_vel_action[0], 0.58)
        actions.append(convert_to_vel_angle_actions(tuple_action))
        # print("angle_vel_action is {}".format(angle_vel_action))
        # print("action is {}".format(action))
        # print("angle_vel action is {}".format(angle_vel_action))
        obs, reward, done, info = env.step(angle_vel_action)
        total_reward += reward
        # print("obs has shape{}".format(obs.shape))
        # print("speed is {}".format(env.speed))
        print("reward is {}".format(reward))
        steps += 1
# Play from the command line against the trained agent
# in an actual (non-RLlib-wrapped) open-spiel env.
human_player = 1
env = Environment("connect_four")

while num_episodes < args.num_episodes_human_play:
    print("You play as {}".format("o" if human_player else "x"))
    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        if player_id == human_player:
            action = ask_user_for_action(time_step)
        else:
            obs = np.array(time_step.observations["info_state"][player_id])
            action = trainer.compute_action(obs, policy_id="main")
            # In case computer chooses an invalid action, pick a
            # random one.
            legal = time_step.observations["legal_actions"][player_id]
            if action not in legal:
                action = np.random.choice(legal)
        time_step = env.step([action])
        print(f"\n{env.get_state}")

    print(f"\n{env.get_state}")
    print("End of game!")
    if time_step.rewards[human_player] > 0:
        print("You win")
    elif time_step.rewards[human_player] < 0:
        print("You lose")
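# The loop above relies on an ask_user_for_action() helper defined elsewhere in
# the script. A minimal sketch of such a helper (prompting until the user enters
# one of the legal OpenSpiel actions) could look like this:
def ask_user_for_action(time_step):
    """Ask the user for a valid action on the command line and return it."""
    current_player = time_step.observations["current_player"]
    legal_moves = time_step.observations["legal_actions"][current_player]
    choice = -1
    while choice not in legal_moves:
        print("Choose an action from {}:".format(legal_moves))
        try:
            choice = int(input())
        except ValueError:
            continue
    return choice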
# fdb733b6
checkpoint = 600
checkpoint_dir = "/home/lucius/ray_results/two_policies_vs_static_agents/PPO_RllibPomme_0_2020-06-09_23-39-347whmqdrs"
ppo_agent.restore("{}/checkpoint_{}/checkpoint-{}".format(
    checkpoint_dir, checkpoint, checkpoint))

agent_list = []
for agent_id in range(4):
    agent_list.append(agents.StaticAgent())
env = pommerman.make("PommeTeam-v0", agent_list=agent_list)

for i in range(1):
    obs = env.reset()
    done = False
    while not done:
        env.render()
        actions = env.act(obs)
        actions[0] = ppo_agent.compute_action(observation=featurize(obs[0]),
                                              policy_id="policy_0")
        actions[2] = ppo_agent.compute_action(observation=featurize(obs[2]),
                                              policy_id="policy_0")
        obs, reward, done, info = env.step(actions)
    print("reward:", reward)
    print("done:", done)
    print("info:", info)
    print("=========")
env.render(close=True)
# env.close()
config2 = DEFAULT_CONFIG.copy()
config2['num_workers'] = 4
config2['num_sgd_iter'] = 30
config2['sgd_minibatch_size'] = 128
config2['model']['fcnet_hiddens'] = [100, 100]
config2['num_cpus_per_worker'] = 0

agent2 = PPOTrainer(config2, 'CartPole-v0')
for i in range(2):
    result = agent2.train()
    print(pretty_print(result))

checkpoint_path = agent2.save()
print(checkpoint_path)

trained_config = config2.copy()
test_agent = PPOTrainer(trained_config, 'CartPole-v0')
test_agent.restore(checkpoint_path)

env = gym.make('CartPole-v0')
state = env.reset()
done = False
cumulative_reward = 0
while not done:
    action = test_agent.compute_action(state)
    state, reward, done, _ = env.step(action)
    cumulative_reward += reward
print(cumulative_reward)

# tensorboard --logdir="/Users/guoqiong/ray_results/PPO_CartPole-v0_2020-04-29_18-58-22yq1yq16u/" --host=0.0.0.0
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"

    default_config = {
        "nbr_of_actions": 4,
        "n_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99
        env_config = agent_config["env_config"]

        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]
        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)
        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)
        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb
        # pdb.set_trace()
        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
            # ray.init(redis_address="localhost:6379")

    def build_model(self):
        trainer_config = DEFAULT_CONFIG.copy()
        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #     "type": "Curiosity",
        #     "eta": 0.2,
        #     "lr": 0.001,
        #     "feature_dim": 128,
        #     "feature_net_config": {
        #         "fcnet_hiddens": [],
        #         "fcnet_activation": "relu",
        #     },
        #     "sub_exploration": {
        #         "type": "StochasticSampling",
        #     }
        # }
        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """
        # trainer_config["env_config"] = copy.deepcopy(env_config)  # {"rules": "qiyang_role"}
        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty = None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()
        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {}".format(
                    env_config["ppo_checkpoint_path"]))
            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. I am returning an initial model"
                    .format(env_config["ppo_checkpoint_path"]))
        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )

        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):
        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["n_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter:{}, policy_reward_max: {}, episode_reward_max {}, episode_reward_mean {}, info.num_steps_trained: {}..."
                .format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result["episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {}, file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result["episode_reward_mean"]
        return self.save_model()

    def save_model(self):
        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)
        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"],
        #     self.agent_config["env_config"]["org_code"],
        #     self.agent_config["env_config"]["team_id"]
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")
        return _path  # self.trainer

    def predict_action(self, observation=None):
        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env
        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()
        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)
        predicted_action = self.trainer.compute_action(observation)
        # V predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
            if len(actions) >= self.config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0
        return actions

    def predict_action_dict_list(self, env=None, job_code=None, observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env
        curr_job = copy.deepcopy(env.jobs_dict[job_code])
        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index

        env.current_job_i = job_i
        observation = env._get_observation()
        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            # Use the decoded action dict here; the raw action is just an array.
            a_worker_code_list=action_dict.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
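# A wiring-up sketch for the plugin above (not from the original code): the env
# class, org/team codes, and paths are placeholders for whatever the surrounding
# Kandbox project actually provides.
agent_config = {
    "env": KandboxPlannerEnv,  # hypothetical RLlib-compatible env class
    "env_config": {
        "working_dir": "/tmp",
        "org_code": "demo_org",
        "team_id": 1,
        # "ppo_checkpoint_path": "/tmp/model_checkpoint_org_demo_org_team_1",
    },
}
agent = KandboxAgentRLLibPPO(agent_config=agent_config, kandbox_config={"n_epochs": 10})
agent.load_model()  # builds the PPOTrainer and restores a checkpoint if one exists
model_path = agent.train_model()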
def fulltest(total_trials, training_trials, d, m, q, train_check,
             evaluation_trials=5000, lr=0.00005, num_workers=4, num_gpus=0.25,
             SDP=True, LG=False, local_SDP=False, dep=True, rngvec=np.ones(1000)):
    quantization = 20
    separable = True
    bigvec = np.zeros((total_trials, int(training_trials / train_check) + 1))
    vec_SDP = []
    vec_local_SDP = []
    vec_LG = []
    for j in range(total_trials):
        print("Starting round", j, "of", total_trials)
        rho, _ = qsdl.generate_initial_state(d, m, rng=rngvec[j], depolarized=dep)
        if local_SDP == True:
            lg = max_SDP_sim_order(q, rho, len(d), 1250, d)
            vec_local_SDP.append(lg)
            print("local SDP-based")
            print(lg)
        if SDP == True:
            sdpr = sdp.SDP(rho, q, len(d))
            vec_SDP.append(sdpr)
            print("SDP")
            print(sdpr)
        if LG == True:
            lg = LG_sim_order(copy.copy(q), copy.copy(rho), len(d), 2500, d)
            vec_LG.append(lg)
            print("LG")
            print(lg)
        print("RLNN: ")
        print(bigvec[-1])

        defaultconfig = {
            "rho": copy.copy(rho),
            "q": copy.copy(q),
            "quantization": quantization,
            "d": d,
            "separable": True
        }
        vec = []
        ray.shutdown()
        ray.init(**ray_init_kwargs)
        config = ppo.DEFAULT_CONFIG.copy()
        if num_gpus > 0:
            config["num_gpus"] = num_gpus
        config["num_workers"] = num_workers
        config["lr"] = lr
        config["train_batch_size"] = 8000
        config["num_sgd_iter"] = 5
        config["env_config"] = defaultconfig
        trainer = Trainer(config=config, env=qsdl.QSDEnv)

        for i in range(training_trials):
            result = trainer.train()
            print("train iteration", i + 1, "/", training_trials,
                  " avg_reward =", result["episode_reward_mean"],
                  " timesteps =", result["timesteps_total"])
            # if i % check == check-1:
            #     checkpoint = trainer.save()
            #     print("checkpoint saved at", checkpoint)
            if i == 0 or (i + 1) % train_check == 0:
                rew = 0
                for i in range(evaluation_trials):
                    env = qsdl.QSDEnv(defaultconfig)
                    obs = env.reset()
                    done = False
                    while not done:
                        action = trainer.compute_action(obs)
                        obs, r, done, _ = env.step(action)
                        rew += r
                vec.append(rew / evaluation_trials)
        bigvec[j] = vec
    return bigvec, vec_SDP, vec_local_SDP, vec_LG
trainer_config["train_batch_size"] = 400 trainer_config["sgd_minibatch_size"] = 64 trainer_config["num_sgd_iter"] = 10 trainer = PPOTrainer(trainer_config, SIR); for i in range(200): print("Training iteration {}...".format(i)) trainer.train() env = SIR() state = env.reset() done = False #max_state = -1 cumulative_reward = 0 total_states = list() while not done: action = trainer.compute_action(state) state, reward, done, results = env.step(action) #max_state = max(max_state, state) total_states.append(state) cumulative_reward += reward print("Cumulative reward you've received is: {}. Congratulations!".format(cumulative_reward)) print("Final state is", state)
def main() -> None:
    ray.init()
    np.random.seed(0)

    # instructions = {
    #     0: [Instruction(time=0, x=5, y=5)],
    #     1: [Instruction(time=1, x=5, y=5), Instruction(time=1, x=1, y=5)],
    #     2: [Instruction(time=2, x=5, y=5, rng=np.random.default_rng())],
    # }
    # task = Task(
    #     target_x=1,
    #     target_y=5,
    #     instructions=instructions,
    #     tot_frames=4,
    #     width=42,
    #     height=42,
    # )
    # task = ODR(target_x=1, target_y=5, width=42, height=42)
    # task = Gap(target_x=1, target_y=5, width=42, height=42)
    task = ODRDistract(target_x=1, target_y=5, width=42, height=42)

    def env_creator(env_config):
        return Environment(env_config)  # return an env instance

    register_env("my_env", env_creator)

    # trainer_config = DEFAULT_CONFIG.copy()
    # trainer_config["num_workers"] = 1
    # trainer_config["train_batch_size"] = 20  # 100
    # trainer_config["sgd_minibatch_size"] = 15  # 32
    # trainer_config["num_sgd_iter"] = 50

    trainer = PPOTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            "sgd_minibatch_size": 5,
            "num_sgd_iter": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # "lstm_use_prev_action_reward": False,
            # },
        },
    )
    # Note: this immediately replaces the PPO trainer above; only the A2C trainer
    # is actually trained and evaluated below.
    trainer = A2CTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # "lstm_use_prev_action_reward": False,
            # },
        },
    )
    # trainer = DQNTrainer(
    #     env="my_env",
    #     config={
    #         "env_config": {"task": task},
    #         "framework": "torch",
    #         "num_workers": 1,
    #         "train_batch_size": 10,
    #         # "model": {
    #         #     # Whether to wrap the model with an LSTM.
    #         #     "use_lstm": True,
    #         #     # Max seq len for training the LSTM, defaults to 20.
    #         #     "max_seq_len": task.tot_frames - 1,
    #         #     # Size of the LSTM cell.
    #         #     "lstm_cell_size": task.tot_frames - 1,
    #         #     # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    #         #     # "lstm_use_prev_action_reward": False,
    #         # },
    #     },
    # )

    env = Environment(env_config={"task": task})

    for i in range(200):
        print(f"Training iteration {i}...")
        trainer.train()

    done = False
    cumulative_reward = 0.0
    observation = env.reset()
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, results = env.step(action)
        print(f"Time: {env.time}. Action: {action}")
        cumulative_reward += reward
    print(
        f"Last step reward: {reward: .3e}; Cumulative reward: {cumulative_reward:.3e}"
    )
agentLow = PPOTrainer(config_low)
experiment_name = "HWalk_Low_Mimic"
experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
checkpoint_num = "1930"
agentLow.restore(
    "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
        experiment_name, experiment_id, checkpoint_num, checkpoint_num))
# agent.export_policy_model("out/model", "default_policy")
# agent.import_model("out/model")
# agent.get_policy("default_policy").import_model_from_h5

agentHigh = PPOTrainer(config_hier)

lowWeight = agentLow.get_policy().get_weights()
highWeight = agentHigh.get_policy("low_level_policy").get_weights()
importedPolicy = {
    hw: lowWeight[lw]
    for hw, lw in zip(highWeight.keys(), lowWeight.keys())
}
s1 = agentLow.get_policy().get_state()
s11 = OrderedDict([(k.replace("default_policy", "low_level_policy"), v)
                   for k, v in s1['_optimizer_variables'].items()])
importedPolicy['_optimizer_variables'] = s11
agentHigh.get_policy("low_level_policy").set_state(importedPolicy)

obs = single_env.low_level_obs_space.sample()
print(agentLow.compute_action(obs))
print(agentHigh.compute_action(obs, policy_id='low_level_policy'))
print("=============================================================")
ray.shutdown()
# ================= Enjoy a trained agent =================

t_end = 10.0  # Total duration of the simulation(s) in seconds

try:
    env = env_creator(rllib_cfg["env_config"])
    test_agent = Trainer(agent_cfg, env="my_custom_env")
    test_agent.restore(checkpoint_path)

    t_init = time.time()
    t_prev = t_init
    while t_prev - t_init < t_end:
        observ = env.reset()
        done = False
        cumulative_reward = 0
        while not done:
            if not (t_prev - t_init < t_end):
                break
            action = test_agent.compute_action(observ, explore=False)
            observ, reward, done, _ = env.step(action)
            cumulative_reward += reward
            env.render()
            # Clamp to zero in case the step took longer than env.dt.
            sleep(max(0.0, env.dt - (time.time() - t_prev)))
            t_prev = time.time()
        print(cumulative_reward)
except KeyboardInterrupt:
    print("Interrupting testing...")

# ================= Terminate the Ray backend =================

ray.shutdown()
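# The test block above assumes "my_custom_env" was registered earlier (during
# training) with the same env_creator; a minimal sketch of that registration:
from ray.tune.registry import register_env

register_env("my_custom_env", env_creator)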