def generate_level_replay(ppo, mdp_id, wandb_save_dir, nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782):
    ppo_graph = tf.Graph()
    print('Created graph')

    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)
    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_id), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(
        venv=venv_eval, filename=None, keep_buf=100,
    )
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)
    print('Created env')

    graph_one_vars = ppo_graph.get_all_collection_keys()
    model_path = wandb_save_dir + '/%d/ppo-1' % mdp_id

    with tf.compat.v1.Session(graph=ppo_graph, config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)) as sess_1:
        with tf.compat.v1.variable_scope("model_%d" % np.random.randint(0, 100000, 1).item()):
            ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
            initialize = tf.compat.v1.global_variables_initializer()
            sess_1.run(initialize)
        print('Inited session')

        model_saver = tf.train.import_meta_graph(model_path + '.meta')
        model_saver.restore(sess_1, save_path=model_path)
        print('Restored PPO')

        mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1, venv_eval, nsteps=nsteps_rollout, param_vals='pretrained')
        print('Collected level data')

    venv_eval.close()
    return mb_obs_1, mb_actions_1, mb_rewards_1
def run_state_test(env_name):
    env_kwargs = dict(num=2, env_name=env_name, rand_seed=0)
    env = ProcgenGym3Env(**env_kwargs)
    rng = np.random.RandomState(0)
    actions = [
        gym3.types_np.sample(env.ac_space, bshape=(env.num,), rng=rng)
        for _ in range(NUM_STEPS)
    ]

    ref_rollouts = run_in_subproc(gather_rollouts, env_kwargs=env_kwargs, actions=actions)
    assert len(ref_rollouts) == NUM_STEPS + 1

    # run the same thing a second time
    basic_rollouts = run_in_subproc(gather_rollouts, env_kwargs=env_kwargs, actions=actions)
    assert_rollouts_identical(ref_rollouts, basic_rollouts)

    # run but save states
    state_rollouts = run_in_subproc(gather_rollouts, env_kwargs=env_kwargs, actions=actions, get_state=True)
    assert_rollouts_identical(ref_rollouts, state_rollouts)

    # make sure states are the same
    state_rollouts_2 = run_in_subproc(gather_rollouts, env_kwargs=env_kwargs, actions=actions, get_state=True)
    assert_rollouts_identical(ref_rollouts, state_rollouts_2)
    assert_rollouts_identical(state_rollouts, state_rollouts_2)

    # save and restore at each timestep
    state_rollouts_3 = run_in_subproc(
        gather_rollouts,
        env_kwargs=env_kwargs,
        actions=actions,
        get_state=True,
        set_state_every_step=True,
    )
    assert_rollouts_identical(ref_rollouts, state_rollouts_3)
    assert_rollouts_identical(state_rollouts, state_rollouts_3)

    # restore a point in the middle of the rollout and make sure that the remainder of the data looks the same
    offset = NUM_STEPS // 2
    state_restore_rollouts = run_in_subproc(
        gather_rollouts,
        env_kwargs={**env_kwargs, "rand_seed": 1},
        actions=actions[offset:],
        state=state_rollouts[offset]["state"],
        get_state=True,
    )
    assert_rollouts_identical(ref_rollouts[offset:], state_restore_rollouts)
    assert_rollouts_identical(state_rollouts[offset:], state_restore_rollouts)
def make_env(steps_per_env):
    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)

    if Config.FIRST_PHASE == 'exploration':
        # baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
    else:
        # baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)

    if Config.SECOND_PHASE == 'exploration':
        # baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
        gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
    elif Config.SECOND_PHASE != "None":
        # baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
        gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
    else:
        baseline_vec_adapt = gym3_env_adapt = None

    venv_train = FakeEnv(gym3_env_train, observation_space, action_space)
    venv_train = VecExtractDictObs(venv_train, "rgb")
    if Config.SECOND_PHASE != "None":
        venv_adapt = FakeEnv(gym3_env_adapt, observation_space, action_space)
        venv_adapt = VecExtractDictObs(venv_adapt, "rgb")

    venv_train = VecMonitor(
        venv=venv_train, filename=None, keep_buf=100,
    )
    if Config.SECOND_PHASE != "None":
        venv_adapt = VecMonitor(
            venv=venv_adapt, filename=None, keep_buf=100,
        )

    venv_train = VecNormalize(venv=venv_train, ob=False)
    venv_train = wrappers.add_final_wrappers(venv_train)

    if Config.SECOND_PHASE != "None":
        venv_adapt = VecNormalize(venv=venv_adapt, ob=False)
        venv_adapt = wrappers.add_final_wrappers(venv_adapt)
        venv = wrappers.DistributionShiftWrapperVec(env_list=[venv_train, venv_adapt], steps_per_env=steps_per_env)
    else:
        venv = venv_train
        venv_adapt = venv_train = None

    venv.current_env_steps_left = steps_per_env

    return venv, venv_train, venv_adapt
def run_episode_gym3_vec_env(u):
    env = ProcgenGym3Env(num=population_size, env_name="heist")
    rewards = np.zeros(population_size)
    for _ in range(number_env_steps):
        env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,)))
        rew, obs, first = env.observe()
        rewards += rew
    return rewards
def run():
    env = ProcgenGym3Env(num=2, env_name="coinrun", render_mode="rgb_array")
    env = gym3.ViewerWrapper(env, info_key="rgb")
    step = 0
    for i in range(100):
        env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,)))
        rew, obs, first = env.observe()
        print(f"step {step} reward {rew} first {first}")
        step += 1
def get_procgen_venv(*, env_id, num_envs, rendering=False, **env_kwargs):
    if rendering:
        env_kwargs["render_human"] = True

    env = ProcgenGym3Env(num=num_envs, env_name=env_id, **env_kwargs)
    print(env)
    env = gym3.ExtractDictObWrapper(env, "rgb")
    if rendering:
        env = gym3.ViewerWrapper(env, info_key="rgb")

    return env
def test_multi_speed(env_name, num_envs, benchmark):
    env = ProcgenGym3Env(num=num_envs, env_name=env_name)
    actions = np.zeros([env.num])

    def rollout(max_steps):
        step_count = 0
        while step_count < max_steps:
            env.act(actions)
            env.observe()
            step_count += 1

    benchmark(lambda: rollout(1000))
def collect_observations():
    rng = np.random.RandomState(0)
    env = ProcgenGym3Env(num=2, env_name=env_name, rand_seed=23)
    _, obs, _ = env.observe()
    obses = [obs["rgb"]]
    for _ in range(128):
        env.act(
            rng.randint(low=0, high=env.ac_space.eltype.n, size=(env.num,), dtype=np.int32))
        _, obs, _ = env.observe()
        obses.append(obs["rgb"])
    return np.array(obses)
def run_experiment(
    experiment_name,
    environment_name,
    log,
    graph,
    random_seeds,
    n_episodes,
    n_steps,
    n_envs,
    epsilon,
    batch_sz,
    critic_lr,
    actor_lr,
    gamma,
    critic_epochs,
):
    exp_path = create_exp_dir(experiment_name)

    agent = PPO(
        actor_lr=actor_lr,
        critic_lr=critic_lr,
        batch_sz=batch_sz,
        gamma=gamma,
        epsilon=epsilon,
        critic_epochs=critic_epochs,
    )
    # agent = RandomAgent(n_envs=n_envs)

    env = ProcgenGym3Env(
        num=n_envs,
        env_name="coinrun",
        render_mode="rgb_array",
        center_agent=False,
        num_levels=1,
        start_level=2,
    )

    train(agent, env, n_episodes, n_steps)
    generate_graphs(agent, exp_path)

    print(len(agent.buffer.mean_reward))
    print(np.array(agent.buffer.mean_reward).shape)
    print(np.stack(agent.buffer.mean_reward).shape)
    print(agent.buffer.mean_reward)
    plt.plot(agent.buffer.mean_reward)
    plt.show()

"""
def gather_rollouts(env_kwargs, actions, state=None, get_state=False, set_state_every_step=False):
    env = ProcgenGym3Env(**env_kwargs)
    if state is not None:
        env.callmethod("set_state", state)
    result = [dict(ob=env.observe(), info=env.get_info())]
    if get_state:
        result[-1]["state"] = env.callmethod("get_state")
        if set_state_every_step:
            env.callmethod("set_state", result[-1]["state"])
    for act in actions:
        env.act(act)
        result.append(dict(ob=env.observe(), info=env.get_info()))
        if get_state:
            result[-1]["state"] = env.callmethod("get_state")
            if set_state_every_step:
                env.callmethod("set_state", result[-1]["state"])
    return result
def make_interactive(vision, record_dir, **kwargs):
    info_key = None
    ob_key = None
    if vision == "human":
        info_key = "rgb"
        kwargs["render_mode"] = "rgb_array"
    else:
        ob_key = "rgb"

    env = ProcgenGym3Env(num=1, **kwargs)
    if record_dir is not None:
        env = VideoRecorderWrapper(
            env=env, directory=record_dir, ob_key=ob_key, info_key=info_key
        )
    h, w, _ = env.ob_space["rgb"].shape
    return ProcgenInteractive(
        env,
        ob_key=ob_key,
        info_key=info_key,
        width=w * 12,
        height=h * 12,
    )
def make_env(level_num):
    venv = ProcgenGym3Env(num=num_envs, env_name=env_name, num_levels=1, start_level=level_num)
    return venv
distribution_mode="easy" ) #env = gym3.ViewerWrapper(env, info_key="rgb") #env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,))) rew, obs, first = env.observe() states.append(obs) #print(f"step {step} reward {rew} first {first}") #step += 1 print(len(states)) """ env = ProcgenGym3Env( num=1, env_name="coinrun", render_mode="rgb_array", center_agent=False, num_levels=1, start_level=2, ) env = gym3.ViewerWrapper(env, info_key="rgb") for i in tqdm(range(100)): env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num, ))) rew, obs, first = env.observe() #states.append(obs) #print(f"step {step} reward {rew} first {first}") #step += 1
def ProcgenEnv(num_envs, env_name, **kwargs):
    return ToBaselinesVecEnv(
        ProcgenGym3Env(num=num_envs, env_name=env_name, **kwargs))
env_name = "heist" distribution_mode = config.environment["distribution_mode"] episode_steps = config.environment["episode_steps"] reward_sum = 0 number_validation_runs = 100 num_levels_solved = 0 for env_seed in range(number_validation_runs): env = ProcgenGym3Env(num=1, env_name=env_name, use_backgrounds=False, distribution_mode=distribution_mode, num_levels=1, start_level=env_seed, render_mode="rgb_array") env = gym3.ViewerWrapper(env, info_key="rgb") _, ob, _ = env.observe() observations = ob["rgb"] ob = ep_runner.transform_ob(observations) reward = 0 brain.reset() for i in range(episode_steps): action = brain.step(ob.flatten())
def eval_fitness(self, evaluations, episode_steps: int = 500, break_all_episodes: bool = False):
    """
    :param evaluations: List of 3-tuples (individual, env_seed, number_of_rounds)
    :param episode_steps: Number of steps per episode
    :param break_all_episodes: When one episode is done, break all episodes
    :return:
    """
    # Extract parameters, this list of lists is necessary since pool.map only accepts a single argument
    # See here: http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments
    # individual = evaluations[0]
    env_seed = evaluations[0][1]
    number_of_rounds = evaluations[0][2]

    brains = []
    for single_evaluation in evaluations:
        brains.append(
            self.brain_class(input_size=self.input_size,
                             output_size=self.output_size,
                             individual=single_evaluation[0],
                             configuration=self.brain_configuration,
                             brain_state=self.brain_state))

    fitness_total = 0
    times_episodes = []

    for i in range(number_of_rounds):
        # num_threads=8 can be set here, don't know how it affects performance yet
        env = ProcgenGym3Env(num=len(evaluations),
                             env_name="heist",
                             use_backgrounds=False,
                             distribution_mode=self.distribution_mode,
                             num_levels=1,
                             start_level=env_seed + i)
        rew, ob, first = env.observe()
        observations = ob["rgb"]
        ob = self.transform_ob(observations)

        # print(torch.cuda.memory_summary(device=self.device))
        # print("Memory: {}".format(torch.cuda.memory_allocated(device=self.device)))

        # pool = mp.get_context("spawn").Pool(processes=os.cpu_count())
        fitness_current = [0] * len(evaluations)
        # times_actions = []
        time_s = time.time()

        for i in range(episode_steps):
            # actions = pool.starmap(self.get_actions, zip(brains, ob))
            # time_actions_s = time.time()
            actions = self.calculate_actions_trivial(brains, ob)
            # times_actions.append(time.time() - time_actions_s)
            actions = np.argmax(actions, axis=1)
            env.act(actions)
            rew, ob, first = env.observe()

            if any(first) and break_all_episodes:
                print("break_episodes: One or more environments are done, stopping all episodes")
                break

            observations = ob["rgb"]
            ob = self.transform_ob(observations)

            # print(torch.cuda.memory_summary(device=self.device))
            # print("Memory: {}".format(torch.cuda.memory_allocated(device=self.device)))

            # if i > 10:
            #     break

            fitness_current += rew

        print("Episodes with VecEnv finished")
        # print("Times actions Mean {}".format(np.mean(times_actions)))
        # print("Times actions Std {}".format(np.std(times_actions)))
        # print("Times actions Max {}".format(np.max(times_actions)))
        # print("Times actions Min {}".format(np.min(times_actions)))
        times_episodes.append(time.time() - time_s)
        # break
        fitness_total += fitness_current

    return fitness_total / number_of_rounds, times_episodes
def main():
    print('Parsing args')
    args = setup_utils.setup_and_load()
    print('Setting up MPI')
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    print('Setting config')
    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(25e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(8e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(500e3)
    elif Config.VERY_VERY_SHORT_TRAINING:
        total_timesteps = int(50e3)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    mpi_print(Config.ENVIRONMENT)

    venv, venv_train, venv_adapt = make_env(total_timesteps // 2)  # switch "easy" -> "exploration" halfway
    # import ipdb;ipdb.set_trace()
    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)

    # baseline_vec_eval = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(
        venv=venv_eval, filename=None, keep_buf=100,
    )
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)

    with tf.compat.v1.Session(config=config) as sess:
        if Config.AGENT == 'ppo':
            from coinrun import ppo2 as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_rnd':
            from coinrun import ppo2_rnd as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_diayn':
            from coinrun import ppo2_diayn as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg':
            from coinrun import ppo2_ppg as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg_ssl':
            from coinrun import ppo2_ppg_ssl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal':
            from coinrun import ppo2_goal as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_curl':
            from coinrun import ppo2_curl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal_bogdan' or Config.AGENT == 'ppo_ctrl':
            from coinrun import ppo2_goal_bogdan as agent
            from coinrun import policies_bogdan as policies
        elif Config.AGENT == 'ppg_cluster':
            from coinrun import ppo2_ppg_sinkhorn as agent
            from coinrun import policies_ppg_sinkhorn as policies
        elif Config.AGENT == 'ppo_bisimulation':
            from coinrun import ppo2_bisimulation as agent
            from coinrun import policies_bisimulation as policies
        elif Config.AGENT == 'ppo_pse':
            from coinrun import ppo2_pse as agent
            from coinrun import policies_pse as policies
        policy = policies.get_policy()

        final_eprew_eval = agent.learn(policy=policy,
                                       env=venv,
                                       eval_env=venv_eval,
                                       save_interval=save_interval,
                                       nsteps=Config.NUM_STEPS,
                                       nminibatches=Config.NUM_MINIBATCHES,
                                       lam=0.95,
                                       gamma=Config.GAMMA,
                                       noptepochs=Config.PPO_EPOCHS,
                                       log_interval=1,  # 10,
                                       ent_coef=Config.ENTROPY_COEFF,
                                       lr=lambda f: f * Config.LEARNING_RATE,
                                       lr_ctrl=lambda f: f * Config.LEARNING_RATE_CTRL,
                                       lr_myow=lambda f: f * Config.LEARNING_RATE_MYOW,
                                       cliprange=lambda f: f * 0.2,
                                       total_timesteps=total_timesteps)

    return final_eprew_eval
""" Example random agent script using the gym3 API to demonstrate that procgen works """ from gym3 import types_np from procgen import ProcgenGym3Env env = ProcgenGym3Env(num=1, env_name="coinrun") step = 0 while True: env.act(types_np.sample(env.ac_space, bshape=(env.num, ))) rew, obs, first = env.observe() print(f"step {step} reward {rew} first {first}") if step > 0 and first: break step += 1
def eval(*, network, seed=None, nsteps=2048, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
         gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, load_path=None,
         model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, policy=None,
         nenvs=None, ob_space=None, ac_space=None, nbatch=None, nbatch_train=None, model=None,
         num_trials=3, num_levels=500, start_level=0, gui=False, args=None, **network_kwargs):
    if load_path is not None:
        model.load(load_path)

    if init_fn is not None:
        init_fn()

    for trial in range(num_trials):
        # Start total timer
        tfirststart = time.perf_counter()

        logger.info('Stepping environment...')

        avg_reward = 0
        avg_steps = 0
        for num_level in tqdm(range(start_level, start_level + num_levels)):
            if gui:
                env = ViewerWrapper(ProcgenGym3Env(num=1, env_name="fruitbot", num_levels=1, start_level=num_level, distribution_mode='easy', render_mode="rgb_array"), info_key='rgb')
            else:
                env = ProcgenGym3Env(num=1, env_name="fruitbot", num_levels=1, start_level=num_level, distribution_mode='easy')
            _, obs, _ = env.observe()
            step = 0
            total_reward = 0
            while True:
                actions, _, _, _ = model.step(obs['rgb'])
                env.act(actions)
                rew, obs, first = env.observe()
                total_reward += rew
                if step > 0 and first:
                    break
                step += 1
            avg_reward += total_reward
            avg_steps += step

        avg_reward = avg_reward / num_levels
        avg_steps = avg_steps / num_levels

        logger.info('Done.')

        # End timer
        tnow = time.perf_counter()

        logger.logkv('eval_eprewmean', avg_reward)
        logger.logkv('eval_eplenmean', avg_steps)
        logger.logkv('misc/time_elapsed', tnow - tfirststart)
        logger.dumpkvs()

    return model
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        #total_timesteps = int(120e6)
        total_timesteps = int(25e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(5e6)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    print(Config.ENVIRONMENT)

    baseline_vec = ProcgenEnv(num_envs=nenvs, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode="easy")
    gym3_env = ProcgenGym3Env(num=nenvs, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode="easy")
    venv = FakeEnv(gym3_env, baseline_vec)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)
    #sys.exit(0)

    with tf.Session(config=config) as sess:
        #env = wrappers.add_final_wrappers(env)
        venv = wrappers.add_final_wrappers(venv)

        policy = policies.get_policy()

        #sess.run(tf.global_variables_initializer())
        ppo2.learn(policy=policy,
                   env=venv,
                   #env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    from procgen import ProcgenGym3Env
    env = ProcgenGym3Env(num=1, env_name="coinrun", render_mode="rgb_array")
    ia = Interactive(env, info_key="rgb", width=768, height=768)
    ia.run()
def learn(*, policy, env, eval_env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_size = comm.Get_size()
    #tf.compat.v1.disable_v2_behavior()
    sess = tf.compat.v1.get_default_session()

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    utils.load_all_params(sess)

    runner = Runner(env=env, eval_env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    eval_epinfobuf100 = deque(maxlen=100)
    tfirststart = time.time()
    active_ep_buf = epinfobuf100
    eval_active_ep_buf = eval_epinfobuf100

    nupdates = total_timesteps // nbatch
    mean_rewards = []
    datapoints = []

    run_t_total = 0
    train_t_total = 0

    can_save = False
    checkpoints = [32, 64]
    saved_key_checkpoints = [False] * len(checkpoints)

    if Config.SYNC_FROM_ROOT and rank != 0:
        can_save = False

    def save_model(base_name=None):
        base_dict = {'datapoints': datapoints}
        utils.save_params_in_scopes(sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

    # For logging purposes, allow restoring of update
    start_update = 0
    if Config.RESTORE_STEP is not None:
        start_update = Config.RESTORE_STEP // nbatch

    z_iter = 0
    curr_z = np.random.randint(0, high=Config.POLICY_NHEADS)
    tb_writer = TB_Writer(sess)

    import os
    os.environ["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
    os.environ["WANDB_SILENT"] = "true"
    run_id = np.random.randint(100000000)
    os.environ["WANDB_RUN_ID"] = str(run_id)
    group_name = "%s__%s__%f__%f" % (Config.ENVIRONMENT, Config.RUN_ID, Config.REP_LOSS_WEIGHT, Config.TEMP)
    name = "%s__%s__%f__%f__%d" % (Config.ENVIRONMENT, Config.RUN_ID, Config.REP_LOSS_WEIGHT, Config.TEMP, run_id)
    wandb.init(project='ising_generalization' if Config.ENVIRONMENT == 'ising' else 'procgen_generalization',
               entity='ssl_rl', config=Config.args_dict,
               group=group_name, name=name,
               mode="disabled" if Config.DISABLE_WANDB else "online")

    api = wandb.Api()
    list_runs = api.runs("ssl_rl/procgen_generalization")
    single_level_runs = [run for run in list_runs if 'ppo_per_level' in run.name]
    non_crashed = [run for run in single_level_runs if run.state in ['running', 'finished']]
    game_runs = [run for run in non_crashed if Config.ENVIRONMENT in run.name]
    wandb_save_dir = '%s/%s' % (Config.RESTORE_PATH, Config.ENVIRONMENT)
    print('Save dir: %s' % wandb_save_dir)
    if not os.path.isdir(wandb_save_dir):
        import requests
        for run in game_runs:
            level_id = run.name.split('__')[-1]
            run_save_dir = wandb_save_dir + '/' + level_id
            if not os.path.isdir(run_save_dir):
                os.makedirs(run_save_dir)

            def save_wandb_file(name):
                url = "https://api.wandb.ai/files/ssl_rl/procgen_generalization/%s/%s" % (run.id, name)
                r = requests.get(url)
                with open(run_save_dir + '/%s' % name, 'wb') as fh:
                    fh.write(r.content)

            save_wandb_file('checkpoint')
            save_wandb_file('ppo-1.data-00000-of-00001')
            save_wandb_file('ppo-1.index')
            save_wandb_file('ppo-1.meta')

            print('Downloaded level id %s to %s (run id: %s)' % (level_id, run_save_dir, run.id))
            print(os.listdir(run_save_dir))
            # wandb.restore(wandb_save_dir+"/checkpoint",run_path='/'.join(run.path))

    # load in just the graph and model parameters outside for-loop
    from coinrun import policies as policies_ppo
    ppo = policies_ppo.get_policy()
    ppo_graph_1, ppo_graph_2 = tf.Graph(), tf.Graph()

    PSE_policy = Config.PSE_POLICY

    if PSE_policy == 'ppo_2':
        levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
        if Config.ENVIRONMENT == 'bigfish':
            levels = np.setdiff1d(levels, np.array([4]))
        pse_replay = []
        for mdp_id in levels:
            print('Collecting MDP %d' % mdp_id)
            mb_obs_i, mb_actions_i, mb_rewards_i = generate_level_replay(ppo, mdp_id, wandb_save_dir, nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782)
            pse_replay.append([mb_obs_i, mb_actions_i, mb_rewards_i])

    for update in range(start_update + 1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        # mpi_print('collecting rollouts...')
        run_tstart = time.time()

        packed = runner.run(update_frac=update / nupdates)
        obs, returns, masks, actions, values, neglogpacs, infos, rewards, epinfos, eval_epinfos = packed
        values_i = returns_i = states_nce = anchors_nce = labels_nce = actions_nce = neglogps_nce = rewards_nce = infos_nce = None

        """
        PSE data re-collection
        1. Make 2 envs for respective policies for 2 random levels
        """
        levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
        if Config.ENVIRONMENT == 'bigfish':
            levels = np.setdiff1d(levels, np.array([4]))
        mdp_1, mdp_2 = np.random.choice(levels, size=2, replace=False)
        # import ipdb;ipdb.set_trace()
        observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
        action_space = DiscreteG(15)

        gym3_env_eval_1 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_1), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        venv_eval_1 = FakeEnv(gym3_env_eval_1, observation_space, action_space)
        venv_eval_1 = VecExtractDictObs(venv_eval_1, "rgb")
        venv_eval_1 = VecMonitor(
            venv=venv_eval_1, filename=None, keep_buf=100,
        )
        venv_eval_1 = VecNormalize(venv=venv_eval_1, ob=False)
        venv_eval_1 = wrappers.add_final_wrappers(venv_eval_1)

        gym3_env_eval_2 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_2), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        venv_eval_2 = FakeEnv(gym3_env_eval_2, observation_space, action_space)
        venv_eval_2 = VecExtractDictObs(venv_eval_2, "rgb")
        venv_eval_2 = VecMonitor(
            venv=venv_eval_2, filename=None, keep_buf=100,
        )
        venv_eval_2 = VecNormalize(venv=venv_eval_2, ob=False)
        venv_eval_2 = wrappers.add_final_wrappers(venv_eval_2)

        def random_policy(states):
            actions = np.random.randint(0, 15, Config.NUM_ENVS)
            return actions

        # print('Loading weights from %s'%(wandb_save_dir+'/%d/ppo-1'%mdp_1))
        # with ppo_graph.as_default():
        #     ppo_model = ppo(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
        #import ipdb;ipdb.set_trace()
        # NOTE: this is recreating a graph within the updates, I'm moving them outside the training loop
        if PSE_policy == 'ppo':
            print('Using pretrained PPO policy')
            model1_path = wandb_save_dir + '/%d/ppo-1' % mdp_1
            model2_path = wandb_save_dir + '/%d/ppo-1' % mdp_2

            graph_one_vars = ppo_graph_1.get_all_collection_keys()

            with tf.compat.v1.Session(graph=ppo_graph_1, config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)) as sess_1:
                with tf.compat.v1.variable_scope("model_1"):
                    ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
                    initialize = tf.compat.v1.global_variables_initializer()
                    sess_1.run(initialize)
                model_saver = tf.train.import_meta_graph(model1_path + '.meta')
                model_saver.restore(sess_1, save_path=model1_path)

                mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1, venv_eval_1, nsteps=32, param_vals='pretrained')

            with tf.compat.v1.Session(graph=ppo_graph_2, config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)) as sess_2:
                with tf.compat.v1.variable_scope("model_2"):
                    ppo_model_2 = ppo(sess_2, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
                    initialize = tf.compat.v1.global_variables_initializer()
                    sess_2.run(initialize)
                model_saver = tf.train.import_meta_graph(model2_path + '.meta')
                model_saver.restore(sess_2, save_path=model2_path)

                mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(ppo_model_2, venv_eval_2, nsteps=32, param_vals='pretrained')
        elif PSE_policy == 'random':
            print('Using random uniform policy')
            mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(random_policy, venv_eval_1, nsteps=32, param_vals='random')
            mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(random_policy, venv_eval_2, nsteps=32, param_vals='random')
        elif PSE_policy == 'ppo_2':
            mdp_1, mdp_2 = np.random.choice(np.arange(len(pse_replay)), size=2, replace=False)
            mb_obs_1, mb_actions_1, mb_rewards_1 = pse_replay[mdp_1]
            mb_obs_2, mb_actions_2, mb_rewards_2 = pse_replay[mdp_2]

        # reshape our augmented state vectors to match first dim of observation array
        # (mb_size*num_envs, 64*64*RGB)
        # (mb_size*num_envs, num_actions)
        avg_value = np.mean(values)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        eval_epinfobuf100.extend(eval_epinfos)

        run_elapsed = time.time() - run_tstart
        run_t_total += run_elapsed
        # mpi_print('rollouts complete')

        mblossvals = []

        # mpi_print('updating parameters...')
        train_tstart = time.time()

        mean_cust_loss = 0
        inds = np.arange(nbatch)
        inds_pse = np.arange(1024)
        inds_nce = np.arange(nbatch // runner.nce_update_freq)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            np.random.shuffle(inds_nce)
            for start in range(0, nbatch, nbatch_train):
                sess.run([model.train_model.train_dropout_assign_ops])
                end = start + nbatch_train
                mbinds = inds[start:end]

                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))
                slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
                slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='policy'))

                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))
                np.random.shuffle(inds_pse)
                slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
                slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
                model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='pse')

        # update the dropout mask
        sess.run([model.train_model.train_dropout_assign_ops])
        sess.run([model.train_model.run_dropout_assign_ops])

        train_elapsed = time.time() - train_tstart
        train_t_total += train_elapsed
        # mpi_print('update complete')

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            step = update * nbatch
            eval_rew_mean = utils.process_ep_buf(eval_active_ep_buf, tb_writer=tb_writer, suffix='_eval', step=step)
            rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)
            ep_len_mean = np.nanmean([epinfo['l'] for epinfo in active_ep_buf])

            mpi_print('\n----', update)

            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            tb_writer.log_scalar(ep_len_mean, 'ep_len_mean', step=step)
            tb_writer.log_scalar(fps, 'fps', step=step)
            tb_writer.log_scalar(avg_value, 'avg_value', step=step)
            tb_writer.log_scalar(mean_cust_loss, 'custom_loss', step=step)

            mpi_print('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
            mpi_print('timesteps', update * nsteps, total_timesteps)

            # eval_rew_mean = episode_rollouts(eval_env,model,step,tb_writer)

            mpi_print('eplenmean', ep_len_mean)
            mpi_print('eprew', rew_mean_10)
            mpi_print('eprew_eval', eval_rew_mean)
            mpi_print('fps', fps)
            mpi_print('total_timesteps', update * nbatch)
            mpi_print([epinfo['r'] for epinfo in epinfobuf10])

            rep_loss = 0
            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    mpi_print(lossname, lossval)
                    tb_writer.log_scalar(lossval, lossname, step=step)
            mpi_print('----\n')

            wandb.log({"%s/eprew" % (Config.ENVIRONMENT): rew_mean_10,
                       "%s/eprew_eval" % (Config.ENVIRONMENT): eval_rew_mean,
                       "%s/custom_step" % (Config.ENVIRONMENT): step})

        if can_save:
            if save_interval and (update % save_interval == 0):
                save_model()

            for j, checkpoint in enumerate(checkpoints):
                if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)):
                    saved_key_checkpoints[j] = True
                    save_model(str(checkpoint) + 'M')

    save_model()

    env.close()

    # import subprocess
    # wandb_files = os.listdir('wandb')
    # file_to_save = ''
    # for fn in wandb_files:
    #     if str(run_id) in fn:
    #         file_to_save = fn
    #         break
    # print(file_to_save)
    # my_env = os.environ.copy()
    # my_env["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
    # subprocess.call(['wandb','sync','wandb/'+ file_to_save],env=my_env)

    return mean_rewards
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}

        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)

            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv, extra_actions=coinrun_old_extra_actions, default_action=0
                )
        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)
                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]

        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)

    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)

    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)

    venv = VecShallowCopy(venv)

    return venv