def run_mpc(num_envs):
    # Greedy baseline: choose the best action directly from the reward
    # predictor on the current frame, without learned-dynamics rollouts.
    value_predictor = greedy_reward_predictor(mbhp=(reward_type == "mbhp"))
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    obs, info = env.reset()
    count = 0
    last_count = 0
    if world == 2:
        # Warm-up steps for world 2 before the first prediction.
        for i in range(5):
            count += 1
            obs, rewards, dones, info = env.step([1])
    best_action = value_predictor.predict(info)
    stat = []
    with open(log_path, "a") as logs:
        logs.write("=" * 5 + "\n")
    while count < 50000:
        count += 1
        obs, rewards, dones, info = env.step([best_action])
        # An episode ends on death or once Mario passes x_pos 3150.
        if dones[0] or info[0]['x_pos'] > 3150:
            stat.append([count - last_count, info[0]['x_pos']])
            last_count = count
            with open(log_path, "a") as logs:
                logs.write(str(info[0]['x_pos']) + '\n')
            print('restart')
            print(experiment_name)
            obs, info = env.reset()
            if world == 2:
                for i in range(5):
                    count += 1
                    obs, rewards, dones, info = env.step([1])
        best_action = value_predictor.predict(info)
        print('[{}] {}'.format(count, info[0]['x_pos']))
    np.save(stat_path, np.array(stat))
    value_predictor.close()
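# Illustrative helper (a sketch, not in the original source): run_mpc above
# saves `stat` as an (episodes, 2) array of [steps_in_episode, final_x_pos]
# rows. This loads and summarizes such a file; `path` is assumed to be the
# same stat_path used above, and the function name is made up for the example.
def summarize_greedy_stats(path):
    stat = np.load(path)                       # shape: (num_episodes, 2)
    lengths, final_x = stat[:, 0], stat[:, 1]
    print('episodes: {}, mean length: {:.1f}, mean final x_pos: {:.1f}'.format(
        len(stat), lengths.mean(), final_x.mean()))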
    br = reward_map[18:, 18:, 7, :]
    return tl + tm + tr + ml + mr + bl + bm + br


# Palette: [black, dark blue, light blue, grey, white]
color_map = np.array(
    [[0x00, 0x00, 0x00], [0xad, 0x95, 0x19], [0xe2, 0xd6, 0xa1],
     [0xbe, 0xba, 0xbc], [0xf2, 0xf1, 0xf1]],
    dtype=np.uint8)[None, None]
# Arrow endpoint offsets per action; the values are fractional, so they are
# stored as floats (a uint8 dtype would truncate 3.5 to 3, etc.).
arrow_map = np.array([[0, 0], [3.5, 0], [2.1, 2.8], [4.0, 0], [2.4, 3.2]],
                     dtype=np.float32)[None, None]

if __name__ == "__main__":
    import cv2

    from environments.mario_vec_env import SuperMario_Vec_Env

    visualizer = Visualizer()
    env = SuperMario_Vec_Env(1, 1, 2, wrap_atari=True)
    obs, info = env.reset()
    obs = obs[0]
    action = 0
    cv2.namedWindow('vis')
    # Trackbars select which reward-map slice ('pos') and which action's
    # prediction ('action') to display.
    cv2.createTrackbar('pos', 'vis', 0, 7, lambda x: None)
    cv2.createTrackbar('action', 'vis', 0, 4, lambda x: None)
    sticky_left = 0
    while True:
        # time.sleep(0.05)
        obs, rewards, dones, info = env.step(np.array([action]))
        rgb = info[0]['rgb']
        obs = obs[0]
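# Illustrative sketch (not in the original source) of one way a palette with
# shape (1, 1, 5, 3) like `color_map` can be applied: advanced indexing maps
# an (H, W) array of class indices in [0, 5) to a color image. The function
# name and the label-map input are assumptions made for the example.
def colorize_labels(label_map):
    return color_map[0, 0][label_map]          # -> (H, W, 3) uint8 image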
def run_mpc(num_envs):
    # MPC with a learned dynamics model, instrumented to record a
    # side-by-side video of imagined vs. real rollouts during planning.
    value_predictor = reward_predictor(baseline=baseline)
    dynamics = dynamics_model(model_path='dynamics/mario/ckpts',
                              mode='eval',
                              num_envs=num_envs)
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    last_obs, info = env.reset()
    last_frame = info[0]['rgb']
    env.backup()
    value_predictor.update(info)
    dynamics.reset(last_obs, info)
    count = 0
    last_actions = np.random.randint(env.action_space.n, size=num_envs)
    while count < 50000:
        count += 1
        dynamics.start_planning()
        for _ in range(plan_step):
            # Sticky random shooting: each env repeats its previous action
            # with probability sticky_prob, otherwise resamples uniformly.
            sticky_mask = np.random.random(num_envs) < sticky_prob
            new_actions = (np.random.randint(env.action_space.n,
                                             size=num_envs) *
                           (1 - sticky_mask) + sticky_mask * last_actions)
            last_actions = new_actions
            _, _, dones, info_hat = dynamics.step(new_actions)
            # Step the real env in lockstep, purely to visualize the gap
            # between the model's prediction and reality.
            obs, _, _, info = env.step(new_actions)
            value_predictor.update(info_hat, dones)
            # Mark the predicted agent position on the last real frame...
            y, x = (int(info_hat[0]['y'] / scale_y) + 6,
                    int(info_hat[0]['x'] / scale_x) + 7)
            last_frame[max(0, min(239, y - 2)):max(0, min(239, y + 2)),
                       max(0, min(255, x - 2)):max(0, min(255, x + 2))] = 0
            # ...and the true position on the current real frame.
            obs_to_show = info[0]['rgb']
            y, x = (int(274 - info[0]['y_pos']) + 6,
                    int(info[0]['screen_x_pos']) + 7)
            obs_to_show[max(0, min(239, y - 2)):max(0, min(239, y + 2)),
                        max(0, min(255, x - 2)):max(0, min(255, x + 2))] = 0
            frame_last = cv2.cvtColor(
                cv2.resize(np.uint8(last_frame * 255.0), (1024, 1024),
                           interpolation=cv2.INTER_NEAREST),
                cv2.COLOR_RGB2BGR)
            frame = cv2.cvtColor(
                cv2.resize(np.uint8(obs_to_show * 255.0), (1024, 1024),
                           interpolation=cv2.INTER_NEAREST),
                cv2.COLOR_RGB2BGR)
            # Side-by-side video: predicted position (left) vs. real (right).
            videowriter.write(np.concatenate([frame_last, frame], 1))
            # cv2.imshow('last_obs', frame_last)
            # cv2.imshow('obs', frame)
            # k = cv2.waitKey(5)
        dynamics.end_planning()
        best_action = value_predictor.predict()
        env.restore()
        for action in best_action:
            obs, rewards, dones, info = env.step(np.array([action] * num_envs))
            dynamics.update(obs, info)
            if dones[0] or info[0]['x_pos'] > 3150:
                print('restart')
                obs, info = env.reset()
                dynamics.reset(obs, info)
                break
        last_frame = info[0]['rgb']
        # assert np.equal(last_obs[0], last_obs[1]).all()
        # cv2.imshow('last1', cv2.resize(np.uint8(last_obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # cv2.imshow('last2', cv2.resize(np.uint8(last_obs[1]), (512, 512), interpolation=cv2.INTER_NEAREST))
        print('[{}] {}'.format(info[0]['x_pos'], count))
        env.backup()
        value_predictor.update(info)
        # env.render()
    value_predictor.close()
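# Illustrative helper (a sketch, not in the original source) isolating the
# sticky random-shooting step used inline above: each env keeps its previous
# action with probability sticky_prob and otherwise resamples uniformly. The
# boolean mask promotes to 0/1 in the arithmetic, exactly as in the inline
# version; the function name and signature are made up for the example.
def sample_sticky_actions(last_actions, n_actions, sticky_prob):
    num_envs = len(last_actions)
    sticky_mask = np.random.random(num_envs) < sticky_prob
    fresh = np.random.randint(n_actions, size=num_envs)
    return fresh * (1 - sticky_mask) + sticky_mask * last_actions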
def run_mpc(num_envs):
    # MPC evaluation run: plan with the learned dynamics model, execute the
    # best action sequence in the real env, and record final x positions.
    value_predictor = reward_predictor(mbhp=(reward_type == "mbhp"))
    dynamics = dynamics_model(model_path='dynamics/mario/ckpts',
                              mode='eval',
                              num_envs=num_envs)
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    obs, info = env.reset()
    env.backup()
    value_predictor.update(info)
    dynamics.reset(obs, info)
    last_count = 0
    stat = []
    last_actions = np.random.randint(env.action_space.n, size=num_envs)
    # with open(log_path, "a") as logs:
    #     logs.write("=" * 5 + "\n")
    for count in trange(1, 50001):
        dynamics.start_planning()
        for _ in range(plan_step):
            # Sticky random shooting inside the learned dynamics model.
            sticky_mask = np.random.random(num_envs) < sticky_prob
            new_actions = (np.random.randint(env.action_space.n,
                                             size=num_envs) *
                           (1 - sticky_mask) + sticky_mask * last_actions)
            last_actions = new_actions
            _, _, dones, info = dynamics.step(new_actions)
            # _, _, _, _ = env.step(new_actions)
            if nodeath:
                # Ablation: ignore predicted deaths during planning.
                dones = np.zeros_like(dones)
            value_predictor.update(info, dones)
            # predictor.update(info, np.zeros(num_envs))
            # videowriter.write(cv2.resize(np.uint8(obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
            # for i in range(2):
            #     cv2.imshow(str(i), cv2.resize(np.uint8(obs[i]), (512, 512), interpolation=cv2.INTER_NEAREST))
            #     k = cv2.waitKey(20)
            # env.render()
        dynamics.end_planning()
        best_action = value_predictor.predict()
        env.restore()
        for action in best_action:
            obs, rewards, dones, info = env.step(np.array([action] * num_envs))
            dynamics.update(obs, info)
            if save_video:
                videowriter.write(
                    cv2.cvtColor(
                        cv2.resize(np.uint8(info[0]['rgb'] * 255.0),
                                   (512, 512),
                                   interpolation=cv2.INTER_NEAREST),
                        cv2.COLOR_RGB2BGR))
            if dones[0] or info[0]['x_pos'] > 3150:
                stat.append(info[0]['x_pos'])
                last_count = count
                # with open(log_path, "a") as logs:
                #     logs.write(str(info[0]['x_pos']) + '\n')
                # print('restart')
                # print(experiment_name)
                obs, info = env.reset()
                dynamics.reset(obs, info)
                break
        # last_obs = obs
        # assert np.equal(last_obs[0], last_obs[1]).all()
        # cv2.imshow('last1', cv2.resize(np.uint8(last_obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # cv2.imshow('last2', cv2.resize(np.uint8(last_obs[1]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # print('[{}] {}'.format(count, info[0]['x_pos']))
        env.backup()
        value_predictor.update(info)
        # env.render()
    stat = np.array(stat)
    np.save(stat_path, stat)
    print('mean: {:.3f}, stderr: {:.3f}'.format(
        stat.mean(), stat.std() / np.sqrt(len(stat))))
    value_predictor.close()
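# Illustrative driver (a sketch, not in the original source). run_mpc reads
# its remaining settings (world, stage, plan_step, sticky_prob, nodeath,
# reward_type, save_video, stat_path, ...) from module-level globals, so a
# minimal entry point only picks the rollout population size; 128 is a
# placeholder value, not one taken from the original experiments.
if __name__ == "__main__":
    run_mpc(num_envs=128)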