def main():
    setup_utils.setup_and_load(use_cmd_line_args=False)

    # Make the base environment that we will train the agent on first. This makes
    # 1 gym environment; a different environment could be chosen after each epoch,
    # but currently we only use 1 environment because that works better with the
    # DQN algorithm.
    base_env = make('standard', num_envs=1)
    base_env = CoinRunVecEnvWrapper(base_env)
    # base_env = wrappers.add_final_wrappers(base_env)

    # Make the environment that we will attempt to transfer to
    transfer_environment = make('standard', num_envs=1)
    transfer_environment = CoinRunVecEnvWrapper(transfer_environment)

    t = int(5e3)
    with tf.Session():
        model = make_model()

        print("-----\ntraining base model on training environment\n-----")
        base_statistics = run_deepq(model if model else 'cnn', base_env,
                                    total_timesteps=t, name="base")
        print('mean reward: ', np.mean(np.array(base_statistics['rewards'])))

        print("-----\ntraining transfer model on test environment\n-----")
        transfer_statistics = run_deepq(model if model else 'cnn',
                                        transfer_environment,
                                        total_timesteps=t, name="transfer")
        print('mean reward: ', np.mean(np.array(transfer_statistics['rewards'])))

        model = make_model()
        print("-----\ntraining non-transfer model on test environment\n-----")
        transfer_environment_base_model_statistics = run_deepq(
            model if model else 'cnn', transfer_environment,
            total_timesteps=t, name="transfer")
        print('mean reward: ',
              np.mean(np.array(
                  transfer_environment_base_model_statistics['rewards'])))

        plot_stats(base_statistics, transfer_statistics,
                   transfer_environment_base_model_statistics)
def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    env = make("platform", num_envs=8)
    env = CourierWrapper(env, True)
    env = MyReward(env)
    # env = VecMonitor(env)

    learning_rate = 3e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparams = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01,
    }
    act = ppo2.learn(
        network=MyPolicy,
        env=env,
        total_timesteps=n_timesteps,
        **hyperparams,
        save_interval=100,
        log_interval=20,
        # value_network="copy"
    )
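Train() assigns the learned policy to act but never returns or persists it. A hedged follow-on sketch, assuming Train() is modified to end with `return act` and that the Baselines ppo2 Model API (which exposes save()) is in use; the checkpoint path is arbitrary:

model = Train()  # hypothetical: requires adding `return act` to Train()
model.save("checkpoints/ppo2_coinrun_platform")  # assumed path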
def test_coinrun():
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('CoinRun-v0', num_envs=16)
    for _ in range(100):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, _rews, _dones, _infos = env.step(acts)
    env.close()
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('standard', num_envs=num_envs)
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
    env.close()
def test(config, agent=None, levels=5):
    """Test routine."""
    env = utils.Scalarize(make('standard', num_envs=1))
    if agent is None:
        print("Testing numlvl {} seed {} file: {}".format(
            conrun_config.NUM_LEVELS, conrun_config.SET_SEED,
            config.model_filename))
        agent = DQN(env.observation_space.shape, env.action_space.n)
        if config.enable_gpu and torch.cuda.is_available():
            agent = agent.cuda()
        bestmodel_file = os.path.join(config.save_dir, config.model_filename)
        load_res = torch.load(bestmodel_file, map_location="cpu")
        agent.load_state_dict(load_res["model"])
    else:
        config.render_play = False
    agent.eval()

    success = 0
    total_steps = 0
    for i in range(levels):
        state = env.reset()
        ep_reward = 0
        ep_length = 0
        while True:
            if config.render_play:
                env.render()
            state = torch.unsqueeze(torch.FloatTensor(state), 0)
            action = torch.max(agent.forward(state), 1)[1].data.numpy()[0]  # TODO: debug this
            next_state, reward, done, info = env.step(action)
            ep_length += 1
            ep_reward += reward
            state = copy.copy(next_state)
            if done:
                print("test episode: {}, episode reward: {}, length: {}".format(
                    i, ep_reward, ep_length))
                break
        if ep_reward > 0:
            success = success + 1
        total_steps += ep_length

    print("Testing result: {}% completed. Avg. ep length: {}".format(
        success / levels * 100, total_steps / levels))
    env.close()
    return success >= (levels / 2)
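The action-selection line above packs tensor conversion, the forward pass, and the argmax into one expression. An equivalent helper, shown only for readability (an assumption-labeled sketch: it presumes `agent` maps a batch of states to Q-values of shape [batch, n_actions]):

def greedy_action(agent, state):
    # state: a raw observation from the Scalarize-wrapped environment
    with torch.no_grad():
        q_values = agent(torch.unsqueeze(torch.FloatTensor(state), 0))
    return int(q_values.argmax(dim=1).item())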
def multi_setup(rank, world_size, destination):
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    setup()
    env = make("standard", num_envs=ExpConfig.NUM_ENVS)
    env = add_final_wrappers(env)
    learn(rank, destination, env)
    cleanup()
def __init__(self):
    self.AE = AutoEncoder(args, latent_dim=args.latent_dim).double().to(device)
    self.AE.train()
    self.counter = 0
    self.buffer = np.empty(args.buffer_capacity, dtype=transition)
    setup_utils.setup_and_load(use_cmd_line_args=False)
    self.env = make('standard', num_envs=args.num_envs)
    self.optimizer = optim.Adam(self.AE.parameters(), lr=args.lr)
    self.criterion = nn.MSELoss()
    self.step = 0
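The replay buffer above relies on a NumPy structured dtype named transition that is defined elsewhere. A minimal sketch of what such a dtype could look like (the field names and the 64x64x3 observation shape are assumptions, not the original definition):

import numpy as np

transition = np.dtype([
    ('state',      np.float64, (64, 64, 3)),   # assumed observation shape
    ('action',     np.int64),
    ('reward',     np.float64),
    ('next_state', np.float64, (64, 64, 3)),
    ('done',       np.bool_),
])
buffer = np.empty(10000, dtype=transition)     # e.g. args.buffer_capacity == 10000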
def testing():
    setup_utils.setup_and_load()
    episodes = 10
    env = Scalarize(make('standard', num_envs=1))
    for i in range(episodes):
        env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.action_space.n)
            next_state, reward, done, info = env.step(action)
            if done or reward > 0:
                break
def create_coinrun_env(num_levels, task_id, random_seed_list):
    try:
        random_seed = random_seed_list[task_id]
    except (IndexError, TypeError):
        # Fall back to a default seed if the list is missing or too short
        random_seed = 123
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
def make_coinrun():
    from coinrun import setup_utils, make
    from coinrun_wrapper import CourierWrapper, MyReward
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    env = make("platform", num_envs=256)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    return env
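Usage sketch for make_coinrun() (only names defined above are used):

env = make_coinrun()
obs = env.reset()  # one batched observation per each of the 256 parallel environments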
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=True)
    print(Config.IS_HIGH_RES)
    env = make('standard', num_envs=num_envs)
    env.render()
    viewer = rendering.SimpleImageViewer()
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
        env.render()
    env.close()
def random_agent(num_envs=1, max_steps=100000):
    # Random environment:
    # setup_utils.setup_and_load(use_cmd_line_args=False)
    # Just test in level 1 with the config: --run-id myrun --num-levels 1
    setup_utils.setup_and_load()
    env = make('standard', num_envs=num_envs)
    imgNum = 0
    for step in range(max_steps):
        env.render()
        # acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
        foo = [1, 3]  # restrict the agent to "right" and "jump"
        acts = np.array([random.choice(foo)])
        # Action meanings (velocity-x, velocity-y):
        #   0:  0,  0   no move
        #   1: +1,  0   right
        #   2: -1,  0   left
        #   3:  0, +1   jump
        #   4: +1, +1   right-jump
        #   5: -1, +1   left-jump
        #   6:  0, -1   down (step down from a crate)
        print("python input action: ", acts)
        print("\n env.step(acts): \n")
        _obs, rews, _dones, _infos = env.step(acts)
        # TODO: return distance (change _obs to distance), then condition on it
        img_input = img.imgbuffer_process(_obs, (256, 256))
        if step % 50 == 0:
            # Convert to grayscale
            # TODO: make coinrunMOXCS consume gray images
            # plt.imsave('%i.jpg' % imgNum, img_input.mean(axis=2), cmap="gray")
            # plt.imsave('%i.jpg' % imgNum, img_input)
            # plt.imshow(img_input.mean(axis=2), cmap="gray")
            imgNum = imgNum + 1
            print("imgNum:%i" % imgNum)
        print("step", step, "rews", rews)
    env.close()
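The action table in the comments above matches the action_names used by the play() snippet later in this collection; as a reusable mapping:

ACTION_MEANINGS = {
    0: "none",
    1: "right",
    2: "left",
    3: "jump",
    4: "right-jump",
    5: "left-jump",
    6: "down",  # step down from a crate
}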
def run(seed):
    setup(
        rand_seed=seed,
        num_envs=1,
        high_difficulty=False,
        num_levels=0,
        use_data_augmentation=False,
    )
    env = make("standard", num_envs=1)
    obs = env.reset()
    episode_rew = 0
    done = False
    while not done:
        actions, _, _ = model.get_all_values(obs)
        actions = actions.numpy()
        next_obs, rew, done, _ = env.step(actions)
        obs = next_obs
        done = done.any() if isinstance(done, np.ndarray) else done
        episode_rew += rew
    return episode_rew
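A usage sketch for run(): average the episode return over a handful of evaluation seeds (the seed values are arbitrary, and the trained `model` referenced inside run() is assumed to be in scope):

import numpy as np

eval_seeds = [0, 1, 2, 3, 4]
returns = [run(s) for s in eval_seeds]
print("mean episode reward:", float(np.mean(returns)))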
def __init__(self, hparams):
    # Only supports 1 environment currently
    super().__init__(hparams)
    try:
        from coinrun import setup_utils, make
        setup_utils.setup_and_load(use_cmd_line_args=False)
        self._env = make('standard', num_envs=1)
    except ImportError as e:
        print(e)
        print("please check README for CoinRun installation instructions")
        exit()
    self.seed(1234)

    self._observation_space = self._env.observation_space
    self._action_space = self._env.action_space

    self._hparams.num_states = self._observation_space.shape[0]
    self._hparams.num_actions = self._action_space.n
    self._hparams.state_shape = list(self._observation_space.shape)
    self._hparams.action_space_type = self._action_space.__class__.__name__
    self._hparams.pixel_input = True

    if self._hparams.reward_augmentation is not None:
        self._reward_augmentation = get_reward_augmentation(
            self._hparams.reward_augmentation)
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(load_filename):
        print("Loading model...")
        policy_net = torch.load(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)

    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0               # How many steps have been run
    eval_window = []             # Keep the last 5 episode durations
    best_window = float('inf')   # The best average window duration to date

    # Do training until episodes complete or until ^C is pressed
    try:
        print("training...")
        i_episode = 0  # The episode number
        # Stop when we reach max episodes
        while i_episode < num_episodes:
            print("episode:", i_episode, "epsilon:", epsilon)
            max_reward = 0     # The best reward we've seen this episode
            done = False       # Has the game ended (timed out or got the coin)?
            episode_steps = 0  # Number of steps performed in this episode

            # Initialize the environment and state
            env.reset()

            # Current screen. There is no last screen because we get velocity
            # on the screen itself.
            state = get_screen(env)

            # Loop until the episode ends
            while not done:
                # Select and perform an action
                action, epsilon = select_action(state, policy_net,
                                                env.NUM_ACTIONS, epsilon,
                                                steps_done,
                                                bootstrap_threshold)
                steps_done = steps_done + 1
                episode_steps = episode_steps + 1

                # For debugging
                if RENDER_SCREEN and not IN_PYNB:
                    env.render()

                # Run the action in the environment
                if action is not None:
                    _, reward, done, _ = env.step(np.array([action.item()]))

                    # Record if this was the best reward we've seen so far
                    max_reward = max(reward, max_reward)

                    # Turn the reward into a tensor
                    reward = torch.tensor([reward], device=DEVICE)

                    # Observe the new state
                    current_screen = get_screen(env)

                    # Did the game end?
                    if not done:
                        next_state = current_screen
                    else:
                        next_state = None

                    # Store the transition in memory
                    replay_memory.push(state, action, next_state, reward)

                    # Move to the next state
                    state = next_state

                    # If we are past bootstrapping we should perform one step
                    # of the optimization
                    if steps_done > bootstrap_threshold:
                        optimize_model(policy_net, replay_memory, optimizer,
                                       batch_size, gamma)
                else:
                    # Do nothing if select_action() is not implemented and
                    # returning None
                    env.step(np.array([0]))

                # If we are done, print some statistics
                if done:
                    print("duration:", episode_steps)
                    print("max reward:", max_reward)
                    print("total steps:", steps_done)

            # Should we evaluate?
            if (steps_done > bootstrap_threshold and i_episode > 0
                    and i_episode % eval_interval == 0):
                test_average_duration = 0    # Track the average eval duration
                test_average_max_reward = 0  # Track the average max reward

                # Copy all the weights into the evaluation network
                eval_net.load_state_dict(policy_net.state_dict())

                # Evaluate 10 times
                for _ in range(10):
                    # Call the evaluation function
                    test_duration, test_max_reward = evaluate(
                        eval_net, eval_epsilon, env)
                    test_average_duration = test_average_duration + test_duration
                    test_average_max_reward = (test_average_max_reward
                                               + test_max_reward)
                test_average_duration = test_average_duration / 10
                test_average_max_reward = test_average_max_reward / 10
                print("Average duration:", test_average_duration)
                print("Average max reward:", test_average_max_reward)

                # Append to the evaluation window
                if len(eval_window) < 5:
                    eval_window.append(test_average_duration)
                else:
                    eval_window = eval_window[1:] + [test_average_duration]

                # Compute the window average
                window_average = sum(eval_window) / len(eval_window)
                print("evaluation window:", eval_window,
                      "window average:", window_average)

                # If this is the best window average we've seen, save the model
                if len(eval_window) >= 5 and window_average < best_window:
                    best_window = window_average
                    if save_filename is not None:
                        print("Saving model...")
                        torch.save(policy_net, save_filename)
                        print("Done saving.")

            # Only increment the episode number if we are done with bootstrapping
            if steps_done > bootstrap_threshold:
                i_episode = i_episode + 1
        print('Training complete')
    except KeyboardInterrupt:
        print("Training interrupted")

    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
def play(destination, model):
    model.network.pi.trainable = False
    model.network.value_fc.trainable = False
    tf.random.set_seed(984_373)

    destination = Path(destination).resolve() / "play"
    sequence_folder = destination / "sequence"
    images_folder = destination / "image"
    images_explain_folder = destination / "explain"
    mkdir(sequence_folder)
    mkdir(images_folder)
    mkdir(images_explain_folder)

    metadata = Metadata(
        game_name="Coin run [OpenAI]",
        action_names=[
            "none",
            "right",
            "left",
            "jump",
            "right-jump",
            "left-jump",
            "down",
        ],
        sequence_folder="sequence",
        images_folder="image",
        explain_folder="explain",
    )
    with open(str(destination / "metadata.json"), "w") as outfile:
        json.dump(metadata.as_json(), outfile)

    env = make("standard", num_envs=1)
    obs = env.reset()
    timestep = 0
    episode_rew = 0
    done = False
    layers_to_visit = model.get_first_last_conv_layers()
    while not done:
        obs_hires = env.render(mode="rgb_array")
        actions, state_value, pi_raw = model.get_all_values(obs)
        actions = actions.numpy()
        state_value = state_value.numpy()
        pi_raw = pi_raw.numpy()
        grad_cam_images = grad_cam_heatmap(
            model.network, obs, int(np.argmax(pi_raw)), layers_to_visit
        )
        next_obs, rew, done, _ = env.step(actions)
        obs = next_obs
        done = done.any() if isinstance(done, np.ndarray) else done
        episode_rew += rew

        step = Step(
            timestep=timestep,
            imagename=f"{timestep:05d}",
            reward=float(rew),
            done=int(done),
            actions=list(map(int, actions)),
            state_value=float(state_value[0]),
            pi_raw=list(map(float, pi_raw[0])),
        )
        cv2.imwrite(
            f"{str(images_folder / step.imagename)}.jpg",
            cv2.cvtColor(obs_hires, cv2.COLOR_RGB2BGR),
        )
        for layers_position, grad_cam_image in zip(["first", "last"],
                                                   grad_cam_images):
            filepath = str(
                images_explain_folder / f"{step.imagename}_{layers_position}.jpg"
            )
            cv2.imwrite(filepath, cv2.cvtColor(grad_cam_image, cv2.COLOR_RGB2BGR))
        with open(str(sequence_folder / f"{timestep:05d}.json"), "w") as outfile:
            json.dump(step.as_json(), outfile)
        logger.info(f"Save step: {timestep}, Reward {rew}")
        timestep += 1
    env.close()
import numpy as np
from coinrun import setup_utils, make

config_args = setup_utils.setup_and_load(use_cmd_line_args=False)
env = make('standard', num_envs=4)
for _ in range(1000):
    env.render()
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          target_update=TARGET_UPDATE,
          random_seed=RANDOM_SEED,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set the random seed
    if random_seed is not None:
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(random_seed)

    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(
            os.path.join(MODEL_PATH, load_filename)):
        print("Loading model...")
        policy_net = load_model(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)

    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # The target network is a snapshot of the policy network that lags behind
    # (for stability)
    target_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0             # How many steps have been run
    best_eval = float('inf')   # The best model evaluation to date

    # Do training until episodes complete
    print("training...")
    i_episode = 0  # The episode number
    # Stop when we reach max episodes
    while i_episode < num_episodes:
        print("episode:", i_episode, "epsilon:", epsilon)
        max_reward = 0     # The best reward we've seen this episode
        done = False       # Has the game ended (timed out or got the coin)?
        episode_steps = 0  # Number of steps performed in this episode

        # Initialize the environment and state
        env.reset()

        # Current screen. There is no last screen because we get velocity on
        # the screen itself.
        state = get_screen(env)

        # Loop until the episode ends
        while not done:
            # Select and perform an action
            action, epsilon = select_action(state, policy_net,
                                            env.NUM_ACTIONS, epsilon,
                                            steps_done, bootstrap_threshold)
            steps_done = steps_done + 1
            episode_steps = episode_steps + 1

            # For debugging
            if RENDER_SCREEN and not IN_PYNB:
                env.render()

            # Run the action in the environment
            if action is not None:
                _, reward, done, _ = env.step(np.array([action.item()]))

                # Record if this was the best reward we've seen so far
                max_reward = max(reward, max_reward)

                # Turn the reward into a tensor
                reward = torch.tensor([reward], device=DEVICE)

                # Observe the new state
                current_screen = get_screen(env)

                # Did the game end?
                if not done:
                    next_state = current_screen
                else:
                    next_state = None

                # Store the transition in memory
                replay_memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # If we are past bootstrapping we should perform one step of
                # the optimization
                if steps_done > bootstrap_threshold:
                    optimize_model(
                        policy_net,
                        target_net if target_update > 0 else policy_net,
                        replay_memory, optimizer, batch_size, gamma)
            else:
                # Do nothing if select_action() is not implemented and
                # returning None
                env.step(np.array([0]))

            # If we are done, print some statistics
            if done:
                print("duration:", episode_steps)
                print("max reward:", max_reward)
                status, _ = episode_status(episode_steps, max_reward)
                print("result:", status)
                print("total steps:", steps_done, '\n')

        # Should we update the target network?
        if target_update > 0 and i_episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Should we evaluate?
        if (steps_done > bootstrap_threshold and i_episode > 0
                and i_episode % eval_interval == 0):
            test_average_duration = 0    # Track the average eval duration
            test_average_max_reward = 0  # Track the average max reward

            # Copy all the weights into the evaluation network
            eval_net.load_state_dict(policy_net.state_dict())

            # Evaluate EVAL_COUNT times
            for _ in range(EVAL_COUNT):
                # Call the evaluation function
                test_duration, test_max_reward = evaluate(
                    eval_net, eval_epsilon, env)
                status, score = episode_status(test_duration, test_max_reward)
                # Set test_duration to score to factor in the death penalty
                test_duration = score
                test_average_duration = test_average_duration + test_duration
                test_average_max_reward = (test_average_max_reward
                                           + test_max_reward)
            test_average_duration = test_average_duration / EVAL_COUNT
            test_average_max_reward = test_average_max_reward / EVAL_COUNT
            print("Average duration:", test_average_duration)
            print("Average max reward:", test_average_max_reward)

            # If this is the best evaluation we've seen, save the model
            if test_average_duration < best_eval:
                best_eval = test_average_duration
                if save_filename is not None:
                    save_model(policy_net, save_filename, i_episode)
            print(' ')

        # Only increment the episode number if we are done with bootstrapping
        if steps_done > bootstrap_threshold:
            i_episode = i_episode + 1

    print('Training complete')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
def evaluate(policy_net, epsilon=EVAL_EPSILON, env=None, test_seed=SEED):
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=NUM_LEVELS,
                               set_seed=test_seed)
    # Make an environment if we don't already have one
    if env is None:
        env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the network ready for evaluation (turns off some things like dropout
    # if used)
    policy_net.eval()

    # Current screen. There is no last screen.
    state = get_screen(env)

    steps_done = 0   # Number of steps executed
    max_reward = 0   # Max reward seen
    done = False     # Is the game over?

    print("Evaluating...")
    while not done:
        # Select and perform an action
        action, _ = select_action(state, policy_net, env.NUM_ACTIONS, epsilon,
                                  steps_done=0, bootstrap_threshold=0)
        steps_done = steps_done + 1
        if RENDER_SCREEN and not IN_PYNB:
            env.render()

        # Execute the action
        if action is not None:
            _, reward, done, _ = env.step(np.array([action.item()]))

            # Is this the best reward we've seen?
            max_reward = max(reward, max_reward)

            # Observe the new state
            state = get_screen(env)
        else:
            # Do nothing if select_action() is not implemented and returning None
            env.step(np.array([0]))

    print("duration:", steps_done)
    print("max reward:", max_reward)
    status, _ = episode_status(steps_done, max_reward)
    print("result:", status, '\n')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    return steps_done, max_reward
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}
        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)
            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv,
                    extra_actions=coinrun_old_extra_actions,
                    default_action=0,
                )
        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)
                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]
        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)
    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)
    venv = VecShallowCopy(venv)
    return venv
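A usage sketch for create_env, assuming the Procgen backend is installed; "coinrun" here is the Procgen game name routed through **kwargs as env_name (not the legacy coinrun_old path):

venv = create_env(8, env_name="coinrun", frame_stack=4)
obs = venv.reset()  # batched observations, frame-stacked along the channel axis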
#!/usr/bin/env python
from __future__ import print_function

import sys, gym, time
import numpy as np

from coinrun import setup_utils, make
from course_learn.wrappers import CourierWrapper

setup_utils.setup_and_load(
    use_cmd_line_args=False,
    paint_vel_info=1,
    is_high_res=True,
    set_seed=3,
    num_levels=1,
)
env = CourierWrapper(make("platform", num_envs=1, default_zoom=5.0), True)
# env = make("maze", num_envs=1, default_zoom=5.0)

if not hasattr(env.action_space, 'n'):
    raise Exception('Keyboard agent only supports discrete action spaces')
ACTIONS = env.action_space.n
SKIP_CONTROL = 0  # Use the previous control decision SKIP_CONTROL times;
                  # that's how you can test what frame skip is still usable.

human_agent_action = 0
human_wants_restart = False
human_sets_pause = False


def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
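    # The original snippet is truncated here. A minimal completion, modeled on
    # Gym's examples/agents/keyboard_agent.py; these key bindings are an
    # assumption, not the original author's.
    if key == 0xFF0D:            # Enter restarts the episode
        human_wants_restart = True
    if key == 32:                # spacebar toggles pause
        human_sets_pause = not human_sets_pause
    a = int(key - ord('0'))      # number keys select a discrete action
    if 0 <= a < ACTIONS:
        human_agent_action = a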
def create_coinrun_env(num_levels, random_seed):
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
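Usage sketch for the helper above (the seed value is arbitrary):

env = create_coinrun_env(num_levels=1, random_seed=123)
obs = env.reset()  # batched observation for the single environment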
import numpy as np
from coinrun import setup_utils, make

setup_utils.setup_and_load()
env = make(env_id='standard', num_envs=1)
for _ in range(100):
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
print(_infos)