def batch_evaluate(agent, env_name, seed, episodes, return_obss_actions=False, pixel=False):
    num_envs = min(256, episodes)

    envs = []
    for i in range(num_envs):
        env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)
        envs.append(env)
    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": []
    }

    for i in tqdm(range((episodes + num_envs - 1) // num_envs)):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)

        # Reset agent.
        if hasattr(agent, 'reset'):
            agent.reset()

        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs,), dtype='int64')
        returns = np.zeros((num_envs,))
        already_done = np.zeros((num_envs,), dtype='bool')
        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]

        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for j in range(num_envs):
                    if not already_done[j]:
                        obss[j].append(many_obs[j])
                        actions[j].append(action[j].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs

def __init__(self, name, horizon=None, gamma=0.99, history_length=4,
             fixed_seed=None, use_pixels=False):
    """
    Constructor.

    Args:
        name (str): name of the environment;
        horizon (int, None): the horizon;
        gamma (float, 0.99): the discount factor;
        history_length (int, 4): number of frames to form a state;
        fixed_seed (int, None): if passed, it fixes the seed of the environment
            at every reset. This way, the environment is fixed rather than
            procedurally generated;
        use_pixels (bool, False): if True, MiniGrid's default 7x7x3 observations
            are converted to an image of resolution 56x56x3.

    """
    # MDP creation
    self._not_pybullet = True
    self._first = True

    env = gym.make(name)
    obs_high = 10.
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)  # Get pixel observations
        obs_high = 255.
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    self.env = env

    self._fixed_seed = fixed_seed

    self._img_size = env.observation_space.shape[0:2]
    self._history_length = history_length

    # Get the default horizon
    if horizon is None:
        horizon = self.env.max_steps

    # MDP properties
    action_space = Discrete(self.env.action_space.n)
    observation_space = Box(
        low=0., high=obs_high,
        shape=(history_length, self._img_size[1], self._img_size[0]))
    # Hack to ignore the gym time limit (do not use np.inf, since MiniGrid
    # returns r(t) = 1 - 0.9 * t / T)
    self.env.max_steps = horizon + 1
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    Environment.__init__(self, mdp_info)

    self._state = None

parser.add_argument("--save-interval", type=int, default=50,
                    help="number of updates between two saves (default: 50, 0 means no saving)")

args = parser.parse_args()

utils.seed(args.seed)

if os.environ.get("TORCH_DETECT_ANOMALY", None):
    torch.set_anomaly_enabled(True)

# Generate environments
envs = []
use_pixel = 'pixel' in args.arch
for i in range(args.procs):
    env = gym.make(args.env)
    if use_pixel:
        env = RGBImgPartialObsWrapper(env)
    env.seed(100 * args.seed + i)
    envs.append(env)

# Define model name
suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
instr = args.instr_arch if args.instr_arch else "noinstr"
mem = "mem" if not args.no_mem else "nomem"
model_name_parts = {
    'env': args.env,
    'algo': args.algo,
    'arch': args.arch,
    'instr': instr,
    'mem': mem,
    'seed': args.seed,
    'info': '',

"pagedown": "drop", " ": "toggle" } assert args.model is not None or args.demos is not None, "--model or --demos must be specified." # if args.seed is None: # args.seed = 0 if args.model is not None else 1 # Set seed for all randomness sources utils.seed(args.seed) # Generate environment env = gym.make(args.env) if args.model is not None and 'pixel' in args.model: env = RGBImgPartialObsWrapper(env) env.seed(args.seed) global obs obs = env.reset() print("Mission: {}".format(obs["mission"])) # Define agent agent = utils.load_agent(env, args.model, args.demos, args.demos_origin, args.argmax, args.env) # Run the agent done = True action = None
def make_env(args, dream_env: bool = False, seed: Optional[int] = None,
             keep_image: bool = False, wrap_rnn: bool = True, load_model: bool = True):
    # Prepares an environment that matches the expected format:
    # - The environment returns a 64x64 image in observation["image"]
    #   and camera data (x, y, z, pitch, yaw) in observation["camera"]
    # - If wrapped in the RNN, observation["features"] returns the RNN output to be used for the controller
    # - A dream environment simulates the actual environment using the RNN. It never returns an image
    #   (because the actual environment doesn't get run) and only returns the features
    # - A wrapped environment always returns the features, and can return the original image when keep_image is True

    full_episode = args.full_episode

    # Initialize VAE and MDNRNN networks
    if dream_env or wrap_rnn:
        features_mode = FeatureMode.MODE_ZCH if args.state_space == 2 else FeatureMode.MODE_ZH

        if args.use_gqn:
            encoder = GenerativeQueryNetwork(args.gqn_x_dim, args.gqn_r_dim, args.gqn_h_dim,
                                             args.gqn_z_dim, args.gqn_l, name="gqn")
            encoder_path = get_path(args, "tf_gqn")
        else:
            encoder = CVAE(args)
            encoder_path = get_path(args, "tf_vae")

        rnn = MDNRNN(args)
        rnn_path = get_path(args, "tf_rnn")

        # TODO: Is this still needed? Do we ever NOT load the model?
        if load_model:
            encoder.load_weights(str(encoder_path))
            rnn.load_weights(str(rnn_path))

    if dream_env:
        assert keep_image is False, "Dream environment doesn't support image observations"

        import json
        initial_z_dir = get_path(args, "tf_initial_z")
        if args.use_gqn:
            initial_z_path = initial_z_dir / "initial_z_gqn.json"
            with open(str(initial_z_path), 'r') as f:
                initial_z = json.load(f)
        else:
            initial_z_path = initial_z_dir / "initial_z_vae.json"
            with open(str(initial_z_path), 'r') as f:
                [initial_mu, initial_logvar] = json.load(f)
            # This could probably be done more efficiently
            initial_z = np.array([list(elem) for elem in zip(initial_mu, initial_logvar)], dtype=np.float)

        # Create dream environment
        # noinspection PyUnboundLocalVariable
        env = DreamEnv(initial_z, args.z_size, rnn, features_mode)
    else:
        # Create real environment
        kwargs = {}
        if args.env_name.startswith("VizdoomTakeCover"):
            kwargs["position"] = True  # Include position data as observation for Vizdoom environment

        print("Making environment {}...".format(args.env_name))
        env = gym.make(args.env_name, **kwargs)
        print("Raw environment:", env)

        from gym.envs.box2d import CarRacing
        from vizdoomgym.envs import VizdoomTakeCover
        from gym_minigrid.minigrid import MiniGridEnv

        if isinstance(env.unwrapped, CarRacing):
            # Accept actions in the required format
            env = CarRacingActionWrapper(env)
            # Transform CarRacing observations into expected format and add camera data
            env = CarRacingObservationWrapper(env)
            # Cut off "status bar" at the bottom of CarRacing observation (copied from original paper)
            env = ClipPixelObservationWrapper(env, (slice(84),))
        elif isinstance(env.unwrapped, VizdoomTakeCover):
            # Accept actions in the required format
            env = VizdoomTakeCoverActionWrapper(env)
            # Transform Vizdoom observations into expected format
            env = VizdoomObservationWrapper(env)
            # Cut off "status bar" at the bottom of the screen (copied from original paper)
            env = ClipPixelObservationWrapper(env, (slice(400),))
        elif isinstance(env.unwrapped, MiniGridEnv):
            from gym_minigrid.wrappers import RGBImgPartialObsWrapper
            # Accept actions in the required format
            env = MiniGridActionWrapper(env)
            # Get RGB image observations from the agent's viewpoint
            # (7x7 grid of tiles, with tile size 9 this results in a 63x63 image)
            env = RGBImgPartialObsWrapper(env, tile_size=9)
            # Add camera data to the observation
            env = MiniGridObservationWrapper(env)
            # Pad image to 64x64 to match the requirements (in effect just adding one row at the right and bottom edge
            # with repeated values from the edge)
            env = PadPixelObservationWrapper(env, target_size=64)
        else:
            env = PixelObservationWrapper(env, pixel_keys=("image",))

        if env.observation_space["image"].shape[:2] != (64, 64):
            # Resize image to 64x64
            env = ResizePixelObservationWrapper(env, size=(64, 64))

        # Wrap in RNN to add features to observation
        if wrap_rnn:
            # noinspection PyUnboundLocalVariable
            env = MDNRNNWrapper(env, encoder, rnn, keep_image=keep_image, features_mode=features_mode)

    # TODO: Is this needed? It was only ever implemented for CarRacing and didn't work
    # Force done=False if full_episode is True
    if full_episode:
        env = NoEarlyStopWrapper(env)

    # Set seed if given
    if seed is not None:
        env.seed(seed)

    print("Wrapped environment:", env)
    return env

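# --- Illustrative sketch (an assumption, not part of make_env above) ---
# The MiniGrid branch produces a 63x63 image (7 tiles * 9 px per tile) and then
# pads it to 64x64 by repeating the last row and column. The lines below show the
# same edge padding with plain numpy; PadPixelObservationWrapper is the repo's own
# wrapper and is not used here.
import numpy as np

img = np.zeros((63, 63, 3), dtype=np.uint8)                   # 7 * 9 = 63
padded = np.pad(img, ((0, 1), (0, 1), (0, 0)), mode="edge")   # repeat edge values
assert padded.shape == (64, 64, 3)
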
            if reward_mean > 500:
                break

    def play(self, num_episodes, render=True):
        """Test the trained agent."""
        for episode in range(num_episodes):
            state = self.env.reset()
            total_reward = 0.0

            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    print(f"Total reward: {total_reward} in episode {episode + 1}")
                    break


if __name__ == "__main__":
    env = gym.make("MiniGrid-Empty-8x8-v0")
    env = RGBImgPartialObsWrapper(env)  # Get pixel observations
    env = ImgObsWrapper(env)            # Get rid of the 'mission' field
    agent = Agent(env)
    print("Number of actions: ", agent.actions)
    agent.train(percentile=99.9, num_iterations=64, num_episodes=128)
    agent.play(num_episodes=3)

)
parser.add_argument(
    "--tile_size", type=int, help="size at which to render tiles", default=32
)
parser.add_argument(
    '--agent_view', default=False,
    help="draw what the agent sees (partially observable view)",
    action='store_true'
)

args = parser.parse_args()

env = gym.make(args.env)

if args.agent_view:
    env = RGBImgPartialObsWrapper(env)
    env = ImgObsWrapper(env)

print(dijkstras(env))

window = Window('gym_minigrid - ' + args.env)
window.reg_key_handler(key_handler)

reset()

# Blocking event loop
window.show(block=True)

def generate_demos(n_episodes, valid, seed, shift=0):
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    use_pixels = args.pixels
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)

    agent = utils.load_agent(env, args.model, args.demos, 'agent', args.argmax, args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    just_crashed = False
    while True:
        if len(demos) == n_episodes:
            break

        done = False
        if just_crashed:
            logger.info("reset the environment to find a mission that the bot can solve")
            env.reset()
        else:
            env.seed(seed + len(demos))
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                actions.append(action)
                images.append(obs['image'])
                if use_pixels:
                    directions.append(None)
                else:
                    directions.append(obs['direction'])

                obs = new_obs

            if reward > 0 and (args.filter_steps == 0 or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)), directions, actions))
                just_crashed = False

            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception("mission failed, the seed is {}".format(seed + len(demos)))
                just_crashed = True
                logger.info("mission failed")
        except (Exception, AssertionError):
            if args.on_exception == 'crash':
                raise
            just_crashed = True
            logger.exception("error while generating demo #{}".format(len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info("demo #{}, {:.3f} demos per second, {:.3f} seconds to go".format(
                len(demos) - 1, demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations
        if args.save_interval > 0 and len(demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("{} demos saved".format(len(demos)))
            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("{} demos saved".format(len(demos)))
    print_demo_lengths(demos[-100:])

def batch_evaluate(agent, env_name, seed, episodes, return_obss_actions=False,
                   pixel=False, monitor_gym=False, pairs_dict=None, model_path=None):
    num_envs = min(256, episodes)

    envs = []
    for i in range(num_envs):
        if '_c' in env_name:
            env = gym.make(env_name, pairs_dict=pairs_dict, test_mode=True)
        else:
            env = gym.make(env_name)
        if pixel:
            env = RGBImgPartialObsWrapper(env)
        if monitor_gym:
            demo_path = os.path.join(model_path, 'gym_demos')
            if not i % 64:
                env = Monitor(env, demo_path,
                              video_callable=lambda episode_id: episode_id == 1,
                              force=True)
            else:
                env = Monitor(env, demo_path, video_callable=False, force=True)
        envs.append(env)
    env = ManyEnvs(envs)

    logs = {
        "num_frames_per_episode": [],
        "return_per_episode": [],
        "observations_per_episode": [],
        "actions_per_episode": [],
        "seed_per_episode": [],
        "seen_missions": [env.mission for env in envs]
    }

    for i in range((episodes + num_envs - 1) // num_envs):
        seeds = range(seed + i * num_envs, seed + (i + 1) * num_envs)
        env.seed(seeds)

        many_obs = env.reset()

        cur_num_frames = 0
        num_frames = np.zeros((num_envs,), dtype='int64')
        returns = np.zeros((num_envs,))
        already_done = np.zeros((num_envs,), dtype='bool')
        if return_obss_actions:
            obss = [[] for _ in range(num_envs)]
            actions = [[] for _ in range(num_envs)]

        while (num_frames == 0).any():
            action = agent.act_batch(many_obs)['action']
            if return_obss_actions:
                for j in range(num_envs):
                    if not already_done[j]:
                        obss[j].append(many_obs[j])
                        actions[j].append(action[j].item())
            many_obs, reward, done, _ = env.step(action)
            agent.analyze_feedback(reward, done)
            done = np.array(done)
            just_done = done & (~already_done)
            returns += reward * just_done
            cur_num_frames += 1
            num_frames[just_done] = cur_num_frames
            already_done[done] = True

        logs["num_frames_per_episode"].extend(list(num_frames))
        logs["return_per_episode"].extend(list(returns))
        logs["seed_per_episode"].extend(list(seeds))
        if return_obss_actions:
            logs["observations_per_episode"].extend(obss)
            logs["actions_per_episode"].extend(actions)

    return logs

def BobEnv(size):
    return ImgObsWrapper(RGBImgPartialObsWrapper(_BobEnv(size)))

def __init__(
        self,
        args,
):
    self.args = args

    utils.seed(self.args.seed)
    self.val_seed = self.args.val_seed

    self.use_pixel = 'pixels' in args.model

    # args.env is a list when training on multiple environments
    if getattr(args, 'multi_env', None):
        self.env = [gym.make(item) for item in args.multi_env]
        if self.use_pixel:
            self.env = [RGBImgPartialObsWrapper(e) for e in self.env]

        self.train_demos = []
        for demos, episodes in zip(args.multi_demos, args.multi_episodes):
            demos_path = utils.get_demos_path(demos, None, None, valid=False)
            logger.info('loading {} of {} demos'.format(episodes, demos))
            train_demos = utils.load_demos(demos_path)
            logger.info('loaded demos')
            if episodes > len(train_demos):
                raise ValueError("there are only {} train demos in {}".format(len(train_demos), demos))
            self.train_demos.extend(train_demos[:episodes])
            logger.info('So far, {} demos loaded'.format(len(self.train_demos)))

        self.val_demos = []
        for demos, episodes in zip(args.multi_demos, [args.val_episodes] * len(args.multi_demos)):
            demos_path_valid = utils.get_demos_path(demos, None, None, valid=True)
            logger.info('loading {} of {} valid demos'.format(episodes, demos))
            valid_demos = utils.load_demos(demos_path_valid)
            logger.info('loaded demos')
            if episodes > len(valid_demos):
                logger.info('Using all the available {} demos to evaluate valid. accuracy'.format(
                    len(valid_demos)))
            self.val_demos.extend(valid_demos[:episodes])
            logger.info('So far, {} valid demos loaded'.format(len(self.val_demos)))

        logger.info('Loaded all demos')

        observation_space = self.env[0].observation_space
        action_space = self.env[0].action_space
    else:
        self.env = gym.make(self.args.env)
        if self.use_pixel:
            self.env = RGBImgPartialObsWrapper(self.env)

        demos_path = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=False)
        demos_path_valid = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=True)

        logger.info('loading demos')
        self.train_demos = utils.load_demos(demos_path)
        logger.info('loaded demos')
        if args.episodes:
            if args.episodes > len(self.train_demos):
                raise ValueError("there are only {} train demos".format(len(self.train_demos)))
            self.train_demos = self.train_demos[:args.episodes]

        self.val_demos = utils.load_demos(demos_path_valid)
        if args.val_episodes > len(self.val_demos):
            logger.info('Using all the available {} demos to evaluate valid. accuracy'.format(
                len(self.val_demos)))
        self.val_demos = self.val_demos[:self.args.val_episodes]

        observation_space = self.env.observation_space
        action_space = self.env.action_space

    self.obss_preprocessor = utils.select_obss_preprocessor(
        args.model, observation_space,
        getattr(self.args, 'pretrained_model', None))

    # Define actor-critic model
    self.acmodel = utils.load_model(args.model, raise_not_found=False)
    if self.acmodel is None:
        if getattr(self.args, 'pretrained_model', None):
            self.acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
        else:
            logger.info('Creating new model')
            self.acmodel = ACModel(self.obss_preprocessor.obs_space, action_space,
                                   args.image_dim, args.memory_dim, args.instr_dim,
                                   not self.args.no_instr, self.args.instr_arch,
                                   not self.args.no_mem, self.args.arch)
        if self.obss_preprocessor.vocab is not None:
            self.obss_preprocessor.vocab.save()
        utils.save_model(self.acmodel, args.model)

    self.acmodel.train()
    if torch.cuda.is_available():
        self.acmodel.cuda()

    self.optimizer = torch.optim.Adam(self.acmodel.parameters(), self.args.lr,
                                      eps=self.args.optim_eps)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def partial_rgb_train(env):
    return RGBImgPartialObsWrapper(env, tile_size=6)
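

# --- Illustrative usage sketch (not taken from any of the excerpts above) ---
# Shows the wrapper chain most of these snippets rely on: RGBImgPartialObsWrapper
# renders the agent's partial view as RGB pixels, and ImgObsWrapper strips the
# 'mission' string so only the image remains. Assumes gym and gym-minigrid are
# installed; the environment id and tile_size are arbitrary choices for illustration.
import gym
import gym_minigrid  # noqa: F401  (registers the MiniGrid-* ids, if not already registered)
from gym_minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper

env = gym.make("MiniGrid-Empty-8x8-v0")
env = RGBImgPartialObsWrapper(env, tile_size=8)  # 7x7 tile view rendered at 8 px per tile
env = ImgObsWrapper(env)                         # keep only the image observation

obs = env.reset()
print(obs.shape)  # (56, 56, 3): 7 tiles * 8 px = 56 on each side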