import gym
import numpy as np
from tqdm import tqdm

# RGBImgPartialObsWrapper (gym_minigrid.wrappers) and ManyEnvs are assumed to be
# imported/defined elsewhere in this module.


def batch_evaluate(agent, env_name, seed, episodes, return_obss_actions=False, pixel=False):
    num_envs = min(256, episodes)

    # Build a batch of identical environments and wrap them in a single
    # vectorized environment.
    envs = []
    for i in range(num_envs):
        env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)
        envs.append(env)
    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": []
    }

    for i in tqdm(range((episodes + num_envs - 1) // num_envs)):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)

        # Reset agent.
        if hasattr(agent, 'reset'):
            agent.reset()
        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs,), dtype='int64')
        returns = np.zeros((num_envs,))
        already_done = np.zeros((num_envs,), dtype='bool')
        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]

        # Step all environments in lockstep until every one has finished its episode.
        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for j in range(num_envs):
                    if not already_done[j]:
                        obss[j].append(many_obs[j])
                        actions[j].append(action[j].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs
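# Hedged usage sketch (not part of the original module): evaluate an agent with
# batch_evaluate and summarize the returned logs. The env id "BabyAI-GoToLocal-v0"
# and the `agent` argument are illustrative assumptions; any object exposing
# act_batch/analyze_feedback as used above would work.
def _example_evaluate(agent, env_name="BabyAI-GoToLocal-v0", seed=0, episodes=512):
    logs = batch_evaluate(agent, env_name, seed, episodes)
    success_rate = np.mean([r > 0 for r in logs["return_per_episode"]])
    mean_frames = np.mean(logs["num_frames_per_episode"])
    print("success rate: {:.3f}, mean episode length: {:.1f}".format(success_rate, mean_frames))
    return logs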
assert args.model is not None or args.demos is not None, "--model or --demos must be specified."

# if args.seed is None:
#     args.seed = 0 if args.model is not None else 1

# Set seed for all randomness sources
utils.seed(args.seed)

# Generate environment
env = gym.make(args.env)
if args.model is not None and 'pixel' in args.model:
    env = RGBImgPartialObsWrapper(env)
env.seed(args.seed)

global obs
obs = env.reset()
print("Mission: {}".format(obs["mission"]))

# Define agent
agent = utils.load_agent(env, args.model, args.demos, args.demos_origin, args.argmax, args.env)

# Run the agent
done = True
action = None


def keyDownCb(event):
    global obs
import time

import blosc
import gym
import numpy as np
import torch

# utils, logger, args, print_demo_lengths and RGBImgPartialObsWrapper are assumed
# to be imported/defined elsewhere in this script.


def generate_demos(n_episodes, valid, seed, shift=0):
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    use_pixels = args.pixels
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)

    agent = utils.load_agent(env, args.model, args.demos, 'agent', args.argmax, args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    just_crashed = False
    while True:
        if len(demos) == n_episodes:
            break

        done = False
        if just_crashed:
            logger.info("reset the environment to find a mission that the bot can solve")
            env.reset()
        else:
            env.seed(seed + len(demos))
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                actions.append(action)
                images.append(obs['image'])
                if use_pixels:
                    directions.append(None)
                else:
                    directions.append(obs['direction'])

                obs = new_obs

            if reward > 0 and (args.filter_steps == 0 or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)), directions, actions))
                just_crashed = False

            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception("mission failed, the seed is {}".format(seed + len(demos)))
                just_crashed = True
                logger.info("mission failed")
        except (Exception, AssertionError):
            if args.on_exception == 'crash':
                raise
            just_crashed = True
            logger.exception("error while generating demo #{}".format(len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info("demo #{}, {:.3f} demos per second, {:.3f} seconds to go".format(
                len(demos) - 1, demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations
        if args.save_interval > 0 and len(demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("{} demos saved".format(len(demos)))

            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("{} demos saved".format(len(demos)))

    print_demo_lengths(demos[-100:])
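# Hedged sketch (not part of the original script): each saved demo is a tuple
# (mission, packed_images, directions, actions), with the image stack compressed
# via blosc.pack_array. This shows how a consumer could unpack a single demo;
# utils.load_demos is assumed to mirror utils.save_demos used above.
def _inspect_demo(demos_path, index=0):
    demos = utils.load_demos(demos_path)
    mission, packed_images, directions, actions = demos[index]
    images = blosc.unpack_array(packed_images)  # shape: (n_steps, H, W, C)
    print("mission: {}".format(mission))
    print("steps: {}, image stack shape: {}".format(len(actions), images.shape))
    return mission, images, directions, actions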
import os

import gym
import numpy as np
# Monitor was available from gym.wrappers in the older gym releases this code targets.
from gym.wrappers import Monitor

# RGBImgPartialObsWrapper (gym_minigrid.wrappers) and ManyEnvs are assumed to be
# imported/defined elsewhere in this module.


def batch_evaluate(agent, env_name, seed, episodes, return_obss_actions=False, pixel=False,
                   monitor_gym=False, pairs_dict=None, model_path=None):
    num_envs = min(256, episodes)

    envs = []
    for i in range(num_envs):
        # Environments whose id contains '_c' take extra constructor arguments.
        if '_c' in env_name:
            env = gym.make(env_name, pairs_dict=pairs_dict, test_mode=True)
        else:
            env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)
        if monitor_gym:
            demo_path = os.path.join(model_path, 'gym_demos')
            # Record video for every 64th environment only.
            if not i % 64:
                env = Monitor(env, demo_path,
                              video_callable=lambda episode_id: episode_id == 1,
                              force=True)
            else:
                env = Monitor(env, demo_path, video_callable=False, force=True)
        envs.append(env)
    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": [],
        "seen_missions": [e.mission for e in envs]
    }

    for i in range((episodes + num_envs - 1) // num_envs):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)
        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs,), dtype='int64')
        returns = np.zeros((num_envs,))
        already_done = np.zeros((num_envs,), dtype='bool')
        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]

        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for j in range(num_envs):
                    if not already_done[j]:
                        obss[j].append(many_obs[j])
                        actions[j].append(action[j].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs
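# Hedged usage sketch (an assumption, not part of the original code): running the
# extended batch_evaluate with gym video monitoring enabled. model_path is only
# used to build the gym_demos output directory; pairs_dict is only needed for the
# '_c' environment variants and is left at its default here.
def _example_monitored_eval(agent, env_name, model_path, seed=0, episodes=256):
    logs = batch_evaluate(agent, env_name, seed, episodes,
                          monitor_gym=True, model_path=model_path)
    print("mean return: {:.3f}".format(np.mean(logs["return_per_episode"])))
    print("distinct missions seen: {}".format(len(set(logs["seen_missions"]))))
    return logs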