from gym_unity.envs import UnityEnv


def main(env_name):
    """
    Run the gym test using the specified environment
    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Examine environment parameters
        print(str(env))

        # Reset the environment
        initial_observations = env.reset()

        if len(env.observation_space.shape) == 1:
            # Examine the initial vector observation
            print("Agent observations look like: \n{}".format(initial_observations))

        for _episode in range(10):
            env.reset()
            done = False
            episode_rewards = 0
            while not done:
                actions = env.action_space.sample()
                obs, reward, done, _ = env.step(actions)
                episode_rewards += reward
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        env.close()
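# Illustrative entry point (the binary path is a placeholder, not from the
# original source):
if __name__ == "__main__":
    main("./envs/MyUnityEnv")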
from gym_unity.envs import UnityEnv

# Environment, EnvironmentName and OSName are assumed to come from the
# surrounding project; they are not part of gym_unity.


class Chaser_v1(Environment):
    unity_env_worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value
        else:
            env_filename = None

        self.env = UnityEnv(
            environment_filename=env_filename,
            worker_id=Chaser_v1.unity_env_worker_id,
            use_visual=True,
            multiagent=True
        ).unwrapped
        self.increase_env_worker_id()

        super(Chaser_v1, self).__init__()
        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()
        self.cnn_input_height = self.state_shape[0]
        self.cnn_input_width = self.state_shape[1]
        self.cnn_input_channels = self.state_shape[2]
        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        # Bump the class-level counter so the next instance gets a fresh worker id.
        Chaser_v1.unity_env_worker_id += 1

    def get_n_states(self):
        return 3

    def get_n_actions(self):
        return 3

    def get_state_shape(self):
        return self.env.observation_space.shape

    def get_action_shape(self):
        return self.env.action_space.shape

    def reset(self):
        return self.env.reset()

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        adjusted_reward = reward
        return next_state, reward, adjusted_reward, done, info

    def close(self):
        self.env.close()
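# A minimal usage sketch (not from the original source). Since the wrapper is
# created with multiagent=True, step() presumably takes one action per agent;
# number_agents comes from gym_unity's UnityEnv.
env = Chaser_v1(OSName.MAC)
states = env.reset()
actions = [env.env.action_space.sample() for _ in range(env.env.number_agents)]
next_states, rewards, adjusted_rewards, dones, info = env.step(actions)
env.close()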
from gym_unity.envs import UnityEnv
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy


def run():
    # On Linux, pass no_graphics=True to run without the Unity window.
    env = UnityEnv(env_name, worker_id=1000, use_visual=False,
                   uint8_visual=False, allow_multiple_visual_obs=False,
                   no_graphics=False)

    # Create the agent
    model = PPO2(MlpPolicy, env, verbose=0, learning_rate=1.0e-4)
    model.learn(total_timesteps=num_episodes)

    env.close()
    print("Successfully trained")
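# Hypothetical module-level configuration assumed by run(); the path and the
# step budget are placeholders, not from the original source. Note that
# num_episodes is passed to model.learn() as a *timestep* budget.
env_name = "./envs/MyUnityEnv"
num_episodes = 100000

if __name__ == "__main__":
    run()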
def test_closing(env_name):
    """
    Run the gym test and close the environment multiple times
    :param env_name: Name of the Unity environment binary to launch
    """
    try:
        env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
        env1.close()
        env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
        env2 = UnityEnv(env_name, worker_id=2, use_visual=False, no_graphics=True)
        env2.reset()
    finally:
        env1.close()
        env2.close()
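# Illustrative direct call (placeholder path); in practice this test would be
# driven by a test runner with a real environment binary:
# test_closing("./envs/MyUnityEnv")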
mavg_poison = np.mean(np.array(poison_buffer))
mavg_loss = np.mean(loss_buffer)
mavg_num_batteries = np.mean(np.array(num_batteries_buffer))

food_buffer = []
battery_buffer = []
poison_buffer = []
reward_buffer = []
loss_buffer = []
num_batteries_buffer = []

# Write rolling statistics to file
with open(SAVE_TO_FOLDER + "/dfp_stats.txt", "a+") as stats_file:
    stats_file.write(str(GAME) + " ")
    stats_file.write(str(max_reward) + " ")
    stats_file.write(str(mavg_score) + " ")
    stats_file.write(str(mavg_loss) + " ")
    stats_file.write(str(var_score) + " ")
    stats_file.write(str(mavg_battery) + " ")
    stats_file.write(str(mavg_num_batteries) + " ")
    stats_file.write(str(mavg_food) + " ")
    stats_file.write(str(mavg_poison) + "\n")

env.close()

end = time.time()
time_elapsed = end - start
with open(SAVE_TO_FOLDER + "/timing_info.txt", "w") as text_file:
    print("Time Elapsed: {}".format(time_elapsed), file=text_file)

# KOE: Made it to the end. Now test running, print out, debug, etc.
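# A minimal sketch (not from the original source) for reading the rolling
# statistics back; it assumes the nine space-separated columns written above,
# with the first column (GAME) kept as a string.
def load_dfp_stats(path):
    columns = ["game", "max_reward", "mavg_score", "mavg_loss", "var_score",
               "mavg_battery", "mavg_num_batteries", "mavg_food", "mavg_poison"]
    rows = []
    with open(path) as stats_file:
        for line in stats_file:
            fields = line.split()
            if len(fields) == len(columns):
                rows.append(dict(zip(columns,
                                     [fields[0]] + [float(v) for v in fields[1:]])))
    return rows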
import os
import time

import gym
import numpy as np
import tensorflow as tf
from mpi4py import MPI

from baselines import bench, logger
from baselines.common.misc_util import set_global_seeds
from baselines.ddpg import training
from baselines.ddpg.memory import Memory
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                  OrnsteinUhlenbeckActionNoise)
from gym_unity.envs import UnityEnv


def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = UnityEnv(env_id, rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
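# Illustrative invocation (values are placeholders, not from the source); in
# the original OpenAI Baselines DDPG script these arguments come from an
# argparse front end, and any extra kwargs are forwarded to training.train():
#
#   run(env_id='./envs/MyUnityEnv', seed=0, noise_type='adaptive-param_0.2',
#       layer_norm=True, evaluation=False, **parsed_train_kwargs)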
import threading

import numpy as np
import tensorflow as tf
from gym_unity.envs import UnityEnv

# PPO and the globals (GameDir, modelPath, N_WORKER, BATCH, GAMMA, LAMBDA,
# EP_MAX, ...) are assumed to be defined elsewhere in this project.


class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        # NOTE: each Worker creates its own Lock, so this does not actually
        # synchronize access to GLOBAL_DATA across workers.
        self.lock = threading.Lock()
        self.env = UnityEnv(GameDir, wid, use_visual=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_COUNTER
        t = 0
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r, buffer_v, buffer_done = [], [], [], [], []
            done = False
            while not done:
                if not COLLECT_EVENT.is_set():
                    COLLECT_EVENT.wait()
                    buffer_s, buffer_a, buffer_r, buffer_v, buffer_done = [], [], [], [], []
                a, v = self.ppo.choose_action(s)
                s_, r, done, _ = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                buffer_v.append(v)
                buffer_done.append(done)
                s = s_
                ep_r += r
                t += 1
                GLOBAL_COUNTER += 1
                # update ppo
                if done or GLOBAL_COUNTER >= BATCH:
                    t = 0
                    rewards = np.array(buffer_r)
                    v_final = [v * (1 - done)]
                    terminals = np.array(buffer_done + [done])
                    values = np.array(buffer_v + v_final)
                    # One-step TD residuals, masked at terminal transitions.
                    delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
                    advantage = discount(delta, GAMMA * LAMBDA, terminals)
                    returns = advantage + np.array(buffer_v)
                    advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)
                    bs, ba, br, badv = (np.reshape(buffer_s, (-1,) + self.ppo.s_dim),
                                        np.vstack(buffer_a),
                                        np.vstack(returns),
                                        np.vstack(advantage))
                    buffer_s, buffer_a, buffer_r = [], [], []
                    buffer_v, buffer_done = [], []
                    COLLECT_EVENT.wait()
                    self.lock.acquire()
                    for i in range(len(bs)):
                        GLOBAL_DATA["state"].append(bs[i])
                        GLOBAL_DATA["reward"].append(br[i])
                        GLOBAL_DATA["action"].append(ba[i])
                        GLOBAL_DATA["advantage"].append(badv[i])
                    self.lock.release()
                    if GLOBAL_COUNTER >= BATCH and len(GLOBAL_DATA["state"]) >= BATCH:
                        COLLECT_EVENT.clear()
                        UPDATE_EVENT.set()
                        # self.ppo.update(bs, ba, br, badv)
                    if GLOBAL_EP >= EP_MAX:
                        self.env.close()
                        COORD.request_stop()
                        break

            print("episode = {}, ep_r = {}, wid = {}".format(GLOBAL_EP, ep_r, self.wid))
            GLOBAL_EP += 1
            if GLOBAL_EP != 0 and GLOBAL_EP % 500 == 0:
                self.ppo.save_model(steps=GLOBAL_EP)
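# `discount` is referenced above but not shown in this excerpt. A minimal
# sketch of what it presumably does: a discounted cumulative sum over the TD
# residuals that resets at episode boundaries. It assumes `terminals` has one
# more entry than `x`, matching the `buffer_done + [done]` construction above.
def discount(x, gamma, terminals):
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running * (1 - terminals[i + 1])
        out[i] = running
    return out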
if __name__ == '__main__':
    # Build the shared PPO model from a throwaway environment, then close it.
    tmpenv = UnityEnv(GameDir, 0, use_visual=True).unwrapped
    GLOBAL_PPO = PPO(tmpenv, ModelPath=modelPath)
    tmpenv.close()

    GLOBAL_DATA = {"state": [], "action": [], "reward": [], "advantage": []}
    UPDATE_EVENT, COLLECT_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()
    COLLECT_EVENT.set()

    workers = [Worker(wid=i) for i in range(1, N_WORKER + 1)]
    GLOBAL_COUNTER, GLOBAL_EP = 0, 0
    COORD = tf.train.Coordinator()

    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work, args=())
        t.start()
        threads.append(t)
    threads.append(threading.Thread(target=GLOBAL_PPO.update))
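    # The excerpt ends before the update thread is started; presumably the
    # script continues along these lines (a sketch, not from the source):
    threads[-1].start()
    COORD.join(threads)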
from random import randrange

from gym_unity.envs import UnityEnv

# Environment, EnvironmentName and OSName are assumed to come from the
# surrounding project; they are not part of gym_unity.


class Drone_Racing(Environment):
    worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.DRONE_RACING_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.DRONE_RACING_WINDOWS.value
        else:
            env_filename = None

        # NOTE: the Unity worker id is drawn at random; the class-level
        # counter is incremented but not used for it.
        self.env = UnityEnv(
            environment_filename=env_filename,
            worker_id=randrange(65536),
            use_visual=False,
            multiagent=False
        ).unwrapped

        super(Drone_Racing, self).__init__()
        Drone_Racing.worker_id += 1

        self.action_shape = self.get_action_shape()
        self.action_space = self.env.action_space
        self.continuous = False
        self.skipping_state_fq = 3
        self.skipping_state_index = 0
        self.WIN_AND_LEARN_FINISH_SCORE = 200

    def get_n_states(self):
        return self.env.observation_space.shape[0]

    def get_n_actions(self):
        return self.env.action_space.shape[0]

    def get_state_shape(self):
        return self.env.observation_space

    def get_action_shape(self):
        return self.env.action_space

    def get_action_space(self):
        return self.env.action_space

    @property
    def action_meanings(self):
        return ["FORWARD", "BACKWARD", "RIGHT", "LEFT", "UP", "DOWN",
                "R_ROTATE", "L_ROTATE", "HOVER"]

    def reset(self):
        return self.env.reset()

    def step(self, action):
        # Build a one-hot action vector; during skip phases the agent hovers.
        action_list = [0] * 9
        if self.is_skip_phase():
            action_list[8] = 1  # hover action
        else:
            action_list[action] = 1

        next_state, reward, done, info = self.env.step(action_list)
        adjusted_reward = reward

        info["skipping"] = True
        if not self.is_skip_phase():
            self.skipping_state_index = 0
            info["skipping"] = False
        self.skipping_state_index += 1

        return next_state, reward, adjusted_reward, done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()

    def is_skip_phase(self):
        return self.skipping_state_index != self.skipping_state_fq
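# A minimal usage sketch (not from the original source), assuming the
# project's OSName enum and the five-tuple step() contract above; the fixed
# action index is purely illustrative.
env = Drone_Racing(OSName.MAC)
state = env.reset()
done = False
while not done:
    state, reward, adjusted_reward, done, info = env.step(0)  # 0 == "FORWARD"
env.close()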