import numpy as np
from gym.wrappers.monitor import Monitor


def render(env, agent, name="", record=False):
    # STEPS and the agent's act() method come from the surrounding module.
    if record:
        env = Monitor(env, './video-test/{}'.format(name), force=True, mode="evaluation")
    for i_episode in range(5):
        state = env.reset()
        total_reward = 0
        for step in range(1, STEPS + 1):
            state = np.expand_dims(state, axis=0)
            env.render()
            action_index = agent.act(state)
            action = decode_action(action_index)
            next_state, reward, done, info = env.step(action)
            total_reward += reward  # accumulate before breaking so the terminal reward is counted
            if done:
                break
            state = next_state
        print("Episode achieves total reward {}".format(total_reward))
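# The evaluation loop above calls decode_action() to turn the agent's discrete
# action index into the environment's continuous control vector. A minimal
# sketch, assuming a hand-picked [steer, gas, brake] table for CarRacing; the
# table entries and the helper's exact contract are assumptions, not taken
# from the source.
DISCRETE_ACTIONS = [
    [0.0, 0.0, 0.0],   # no-op
    [-1.0, 0.0, 0.0],  # steer left
    [1.0, 0.0, 0.0],   # steer right
    [0.0, 1.0, 0.0],   # accelerate
    [0.0, 0.0, 0.8],   # brake
]


def decode_action(action_index):
    """Map a discrete action index to a [steer, gas, brake] vector."""
    return np.array(DISCRETE_ACTIONS[action_index], dtype=np.float32)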
    # ... tail of the key_release handler defined earlier in this script
    if k == key.DOWN:
        a[2] = 0


env = CarRacingV1()
env.render()
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release

record_video = False
if record_video:
    from gym.wrappers.monitor import Monitor
    env = Monitor(env, '/tmp/video-test', force=True)

isopen = True
while isopen:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        s, r, done, info = env.step(a)
        total_reward += r
        if steps % 200 == 0 or done:
            print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
            # import matplotlib.pyplot as plt
            # plt.imshow(s)
            # plt.savefig("test.jpeg")
        steps += 1
        isopen = env.render()
        if done or restart or not isopen:
            break
env.close()
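# The manual-play block above relies on a shared action array `a` and a
# key_press handler that mirrors the key_release tail shown at the top; both
# would normally be defined earlier in the script. A minimal sketch, assuming
# the [steer, gas, brake] layout used by gym's CarRacing keyboard demo; the
# restart key and braking value are illustrative assumptions.
import numpy as np
from pyglet.window import key

a = np.array([0.0, 0.0, 0.0])


def key_press(k, mod):
    global restart
    if k == key.RETURN:
        restart = True       # assumed restart key
    if k == key.LEFT:
        a[0] = -1.0
    if k == key.RIGHT:
        a[0] = +1.0
    if k == key.UP:
        a[1] = +1.0
    if k == key.DOWN:
        a[2] = +0.8          # moderate braking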
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        # read actions from file
        global env4list
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)
        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10

    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / math.pi
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / math.pi
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position, env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / math.pi
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55

    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):
        if toRender["random"]:
            env0.render()
        if toRender["line"]:
            env1.render()
        if toRender["circle"]:
            env2.render()
        if toRender["cycloid"]:
            env3.render()
        if toRender["rl"]:
            env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))

        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))

        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi

        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position, [env3.state[0], env3.state[1]])

        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))
    if toRender["random"]:
        env0.close()
    if toRender["line"]:
        env1.close()
    if toRender["circle"]:
        env2.close()
    if toRender["cycloid"]:
        env3.close()

    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return
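# main() above calls two helpers that are not shown here: smooth(), which
# filters the loaded action sequence before replay, and polyfit(), which fits a
# curve to the RL agent's traversed path. Minimal sketches of both, assuming a
# simple moving-average filter and a cubic least-squares fit over (x, y) points;
# the window size, degree, and exact signatures are assumptions, not taken from
# the source.
def smooth(actions, window=5):
    """Moving-average smoothing of a 1-D action sequence."""
    actions = np.asarray(actions, dtype=np.float32)
    kernel = np.ones(window, dtype=np.float32) / window
    return np.convolve(actions, kernel, mode="same")


def polyfit(points, degree=3):
    """Least-squares polynomial fit of y over x for a list of (x, y) points."""
    xs = np.array([p[0] for p in points], dtype=np.float64)
    ys = np.array([p[1] for p in points], dtype=np.float64)
    return np.polyfit(xs, ys, degree)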
def __call__(self, step_limit, solution=None, stamp=None, record=False):
    logger.info("Playing game %s with step_limit %d", self.game, step_limit)

    with torch.no_grad():
        controller = Controller(self.game, self.models_dir)
        if solution is not None:
            controller.load_solution(solution)
        else:
            controller.load_state(stamp)

        vae = VAE(self.game, self.models_dir)
        vae.load_state()

        mdn_rnn = MDN_RNN(self.game, self.models_dir)
        mdn_rnn.load_state()

        env = gym.make(self.game.key)
        if self.game.wrapper is not None:
            env = self.game.wrapper(env)
        if record:
            env = Monitor(env, "monitor", force=True)

        action = torch.zeros(self.game.action_vector_size)

        screen = env.reset()
        screen = transform(screen)
        screen.unsqueeze_(0)

        z, _, _ = vae.encoder(screen)
        _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))
        # h = torch.tensor([[[0] * 256]], dtype=torch.float32)

        overall_reward = 0
        steps = 0
        while True:
            env.render()
            action = controller(z.squeeze(0).squeeze(0), h.squeeze(0).squeeze(0))
            actual_action = self.game.transform_action(action.detach().numpy())

            screen, reward, done, _ = env.step(actual_action)
            overall_reward += reward

            screen = transform(screen)
            screen.unsqueeze_(0)
            z, _, _ = vae.encoder(screen)
            _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))

            if done or (step_limit and steps >= step_limit):
                if done:
                    logger.info("Game reached done")
                else:
                    logger.info("Step limit reached")
                break
            steps += 1

        env.close()

    # Transform reward to be useful to CMA-ES
    overall_reward = self.game.transform_overall_reward(overall_reward)
    logger.info("Game %s finished with reward %d", self.game.key, overall_reward)
    return overall_reward
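# The rollout above pushes every raw frame through transform() before encoding
# it with the VAE. A minimal sketch, assuming the common world-models
# preprocessing of resizing the frame to 64x64 and converting it to a float
# tensor; the resolution and the lack of extra normalization are assumptions,
# not taken from the source.
import torchvision.transforms as T

transform = T.Compose([
    T.ToPILImage(),      # raw gym frame is an HxWx3 uint8 numpy array
    T.Resize((64, 64)),  # VAE input resolution assumed to be 64x64
    T.ToTensor(),        # scales to [0, 1] and moves channels first
])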
from Controller import Controller
from car_racing import CarRacing
from gym.wrappers.monitor import Monitor

# BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS and MAX_STEPS come from
# the surrounding module.
C = Controller()
for weights in [BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS]:
    ENV = Monitor(CarRacing(), f'{weights[:-5]}_SIM', force=True)
    try:
        C.load_parameters(weights)
    except Exception:
        raise Exception('Train the Controller first.')

    done = False
    steps = 0
    observation = ENV.reset()
    reward_FULL = 0
    while not done and steps < MAX_STEPS:
        ENV.render()
        action = C.get_action(observation)
        observation, reward, done, _ = ENV.step(action)
        reward_FULL += reward
        steps += 1
    ENV.close()
    print(f'{weights} Reward: {reward_FULL}')
def test(args, worker_id: int, global_model: torch.nn.Module, T: Value, global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None, global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start worker in _test mode, i.e. no training is done, only testing is used to validate current performance.
    Loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py

    :param args: console arguments
    :param worker_id: id of worker to differentiate them and init different seeds
    :param global_model: global model which is optimized / for split models: the actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for the shared model / for split models: the actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """
    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name), '100_test_runs',
                          video_callable=lambda count: count % 100 == 0, force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()
    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0
    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:
        # Get params from shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []
        sleep = True

        # make several runs to get current avg performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and t % 1 == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:
                            # add a small delay to allow a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling on the environment
                with torch.no_grad():
                    # select mean of normal dist as action --> expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))
                    action = mu.detach()

                state, reward, done, _ = env.step(np.clip(action.numpy(), -args.max_action, args.max_action))
                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset current cumulated reward and episode counter as well as env
                    rewards.append(episode_reward)
                    episode_reward = 0
                    eps_len.append(t)
                    t = 0
                    state = env.reset()

                state = torch.from_numpy(state)

            # necessary to make more than one run
            done = False

        time_print = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} -- mean total reward={rewards:.5f} " \
                     f"+/- {std_reward:.5f} -- mean episode length={np.mean(eps_len):.5f} " \
                     f"+/- {np.std(eps_len):.5f} -- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was made
            logging.info(log_string)
            best_global_reward = max(best_global_reward, global_reward.value)
            best_test_reward = max(best_test_reward, rewards)
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch': T.value,
                    'model': model.state_dict(),
                    'model_critic': model_critic.state_dict() if model_critic is not None else None,
                    'global_reward': global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer': optimizer.state_dict() if optimizer else None,
                    'optimizer_critic': optimizer_critic.state_dict() if optimizer_critic else None,
                },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}_"
                     f"global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar"
            )
        else:
            # by default use only debug messages if no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
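# The test worker above persists new best models via save_checkpoint(). A
# minimal sketch, assuming the helper simply serializes the checkpoint dict
# with torch.save; the real implementation may do more (e.g. prune older
# checkpoint files), and the signature is an assumption matching the call site.
import torch


def save_checkpoint(state, path):
    """Write a checkpoint dict (model/optimizer states, metadata) to disk."""
    torch.save(state, path)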