def compute_gradients(self, loss, var_list, **kwargs):
    grads_and_vars = tf.train.AdamOptimizer.compute_gradients(
        self, loss, var_list, **kwargs)
    grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
    # Flatten all gradients into a single vector so one Allreduce suffices.
    flat_grad = tf.concat(
        [tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
    # Test ranks contribute zero gradients; only training ranks update.
    if Config.is_test_rank():
        flat_grad = tf.zeros_like(flat_grad)
    shapes = [v.shape.as_list() for g, v in grads_and_vars]
    sizes = [int(np.prod(s)) for s in shapes]

    num_tasks = self.comm.Get_size()
    buf = np.zeros(sum(sizes), np.float32)

    def _collect_grads(flat_grad):
        # Sum gradients across all MPI workers, then divide by the number
        # of training workers (num_tasks * train_frac).
        self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
        np.divide(buf, float(num_tasks) * self.train_frac, out=buf)
        return buf

    avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
    avg_flat_grad.set_shape(flat_grad.shape)
    # Un-flatten back to per-variable gradients.
    avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
    avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
                          for g, (_, v) in zip(avg_grads, grads_and_vars)]
    return avg_grads_and_vars
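# A minimal, self-contained sketch of the allreduce-averaging trick used
# above, in plain mpi4py + numpy with no TF. `mpi_average_flat` is an
# illustrative name, not part of this codebase; run under e.g.
# `mpiexec -n 4 python demo.py`.
import numpy as np
from mpi4py import MPI

def mpi_average_flat(flat_grad, comm, train_frac=1.0):
    # flat_grad must be a contiguous float array, identical in shape on
    # every rank.
    buf = np.zeros_like(flat_grad)
    comm.Allreduce(flat_grad, buf, op=MPI.SUM)   # elementwise sum across ranks
    buf /= comm.Get_size() * train_frac          # mean over training ranks
    return buf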
def dump_model(self):
    # utils.save_params_in_scopes(self.sess, [self.scope_dir + "model"], Config.get_save_file())
    data_dict = {}
    save_path = utils.file_to_path(Config.get_save_file())

    data_dict['args'] = Config.get_args_dict()
    data_dict['args']['use_minimum_model'] = True

    param_dict = {}
    if len(self.params) > 0:
        ps = self.sess.run(self.params)
        param_dict["model"] = ps

    data_dict['params'] = param_dict
    joblib.dump(data_dict, save_path)
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves.
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on one CPU;
    # the VecEnv class allows parallel rollouts.
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        policy = policies_back.get_policy()
        # policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
def load_args(load_key='default'):
    """Get the train args of the restore id."""
    load_data = Config.get_load_data(load_key)
    if load_data is None:
        return False

    args_dict = load_data['args']
    # Config.parse_args_dict(args_dict)
    return args_dict
def main():
    # Load train args from the restore file.
    args_dict = utils.load_args()  # train args of the restore id
    test_args = setup_utils.setup_and_load()

    if 'NR' in Config.RESTORE_ID:
        Config.USE_LSTM = 2
    if 'dropout' in Config.RESTORE_ID:
        Config.DROPOUT = 0
        Config.USE_BATCH_NORM = 0

    wandb.init(project="coinrun",
               notes="test",
               tags=["baseline", "test"],
               config=Config.get_args_dict())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    seed = np.random.randint(100000)
    Config.SET_SEED = seed

    # Settings that override those stored in the save file.
    overlap = {
        'set_seed': Config.SET_SEED,
        'rep': Config.REP,
        'highd': Config.HIGH_DIFFICULTY,
        'num_levels': Config.NUM_LEVELS,
        'use_lstm': Config.USE_LSTM,
        'dropout': Config.DROPOUT,
        'use_batch_norm': Config.USE_BATCH_NORM
    }

    load_file = Config.get_load_filename(restore_id=Config.RESTORE_ID)
    mpi_print('load file name', load_file)
    mpi_print('seed', seed)
    mpi_print("---------------------------------------")

    # Evaluate every checkpoint (one is saved every 8M steps).
    for checkpoint in range(1, 33):
        with tf.Session() as sess:
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            enjoy_env_sess(sess, checkpoint, overlap)
def setup_and_load(use_cmd_line_args=True, **kwargs):
    """
    Initialize the global config using command line options, defaulting to
    the values in `config.py`.

    `use_cmd_line_args`: set to False to ignore command line arguments
        passed to the program
    `**kwargs`: override the defaults from `config.py` with these values
    """
    args = Config.initialize_args(use_cmd_line_args=use_cmd_line_args,
                                  **kwargs)
    load_for_setup_if_necessary()
    return args
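# Usage sketch for setup_and_load (the kwargs shown are illustrative
# overrides of the defaults in config.py, not a fixed API):
#
#     args = setup_and_load()                        # parse command line args
#     args = setup_and_load(use_cmd_line_args=False,
#                           num_levels=500)          # programmatic override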
def restore_file_back(restore_id, load_key='default'):
    if restore_id is not None:
        load_file = Config.get_load_filename(restore_id=restore_id)
        filepath = file_to_path(load_file)
        load_data = joblib.load(filepath)
        Config.set_load_data(load_data, load_key=load_key)

        restored_args = load_data['args']
        sub_dict = {}
        res_keys = Config.RES_KEYS
        for key in res_keys:
            if key in restored_args:
                sub_dict[key] = restored_args[key]
            else:
                print('warning key %s not restored' % key)
        Config.parse_args_dict(sub_dict)

    from coinrun.coinrunenv import init_args_and_threads
    init_args_and_threads(4)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves.
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def create_act_model(sess, env, nenvs):
    load_data = Config.get_load_data('default')
    create_additional = 'use_minimum_model' not in load_data['args']

    ob_space = env.observation_space
    ac_space = env.action_space

    policy = policies.get_policy()
    act = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False,
                 create_additional=create_additional)

    return act
def load_params_for_scope(sess, scope, load_key='default'):
    load_data = Config.get_load_data(load_key)
    if load_data is None:
        return False

    params_dict = load_data['params']

    if scope in params_dict:
        print('Loading saved file for scope', scope)
        loaded_params = params_dict[scope]
        loaded_params, params = get_savable_params(loaded_params, scope,
                                                   keep_heads=True)
        restore_params(sess, loaded_params, params)
        return True

    # Scope not present in the save file.
    return False
def step_wait(self):
    self.buf_rew = np.zeros_like(self.buf_rew)
    self.buf_done = np.zeros_like(self.buf_done)

    lib.vec_wait(self.handle, self.buf_rgb, self.buf_render_rgb,
                 self.buf_rew, self.buf_done)

    obs_frames = self.buf_rgb.astype(np.float32)

    if Config.USE_BLACK_WHITE:
        obs_frames = np.mean(obs_frames, axis=-1).astype(np.float32)[..., None]

    if Config.is_test_rank():
        obs_frames = slice_spectrum(obs_frames, Config.TEST_SPECTRUM,
                                    Config.RADIUS)
    else:
        obs_frames = slice_spectrum(obs_frames, Config.TRAIN_SPECTRUM,
                                    Config.RADIUS)

    return obs_frames, self.buf_rew, self.buf_done, self.dummy_info
def save_params_in_scopes(sess, scopes, filename, base_dict=None):
    data_dict = {}
    if base_dict is not None:
        data_dict.update(base_dict)

    save_path = file_to_path(filename)
    data_dict['args'] = Config.get_args_dict()

    param_dict = {}
    for scope in scopes:
        params = tf.trainable_variables(scope)
        if len(params) > 0:
            print('saving scope', scope, filename)
            ps = sess.run(params)
            param_dict[scope] = ps

    data_dict['params'] = param_dict
    joblib.dump(data_dict, save_path)
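# For reference, the file written above is a joblib-pickled dict of roughly
# this shape (a sketch; 'datapoints' appears only when passed in base_dict):
#
#     {
#         'args':   {...},                           # Config.get_args_dict()
#         'params': {'model': [np.ndarray, ...]},    # one list per scope
#         'datapoints': [...],
#     }
#
# so a saved model can be inspected with e.g.:
#
#     data = joblib.load(file_to_path(filename))
#     print([p.shape for p in data['params']['model']])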
def try_load_model(self):
    load_data = Config.get_load_data('default')
    if load_data is None:
        return False

    params_dict = load_data['params']
    if "model" in params_dict:
        print('Loading saved file for scope', "model")
        loaded_params = params_dict["model"]
        if len(loaded_params) != len(self.params):
            print('param mismatch', len(loaded_params), len(self.params))
            assert False
        restore_ops = []
        for p, loaded_p in zip(self.params, loaded_params):
            restore_ops.append(tf.assign(p, loaded_p))
        self.sess.run(restore_ops)
        return True

    return False
def load_params_for_scope(sess, scope, load_key='default', load_path=None):
    if load_path is None:
        load_data = Config.get_load_data(load_key)
    else:
        load_path = file_to_path(load_path)
        if os.path.exists(load_path):
            load_data = joblib.load(load_path)
            print('Load file', load_path)
        else:
            load_data = None

    if load_data is None:
        return False

    params_dict = load_data['params']

    if scope in params_dict:
        print('Loading saved file for scope', scope)
        loaded_params = params_dict[scope]
        loaded_params, params = get_savable_params(loaded_params, scope,
                                                   keep_heads=True)
        restore_params(sess, loaded_params, params)
        return True

    # Scope not present in the save file.
    return False
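# Usage sketch (mirrors the call sites in the test/extraction scripts;
# the explicit load_path value is illustrative):
#
#     is_loaded = load_params_for_scope(sess, 'model')
#     is_loaded = load_params_for_scope(sess, 'model', load_path='saves/my_run')
#     if not is_loaded:
#         print('NO SAVED PARAMS LOADED')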
def restore_file(restore_id, base_name=None, overlap_config=None,
                 load_key='default'):
    """Restore from a save file; overlap_config lets you override settings
    stored in the save file, e.g. the test seed."""
    if restore_id is not None:
        load_file = Config.get_load_filename(restore_id=restore_id,
                                             base_name=base_name)
        filepath = file_to_path(load_file)
        assert os.path.exists(filepath), "load file does not exist"
        load_data = joblib.load(filepath)
        Config.set_load_data(load_data, load_key=load_key)

        restored_args = load_data['args']
        sub_dict = {}
        res_keys = Config.RES_KEYS
        for key in res_keys:
            if key in restored_args:
                sub_dict[key] = restored_args[key]
            else:
                print('warning key %s not restored' % key)
        Config.parse_args_dict(sub_dict)
        print("Load params")

    if overlap_config is not None:
        Config.parse_args_dict(overlap_config)

    from coinrun.coinrunenv import init_args_and_threads
    print("Init coinrun env threads and env args")
    init_args_and_threads(4)

    if restore_id is None:
        return None
    else:
        return load_file
def save_model(base_name=None):
    base_dict = {'datapoints': datapoints}
    utils.save_params_in_scopes(sess, ['model'],
                                Config.get_save_file(base_name=base_name),
                                base_dict)
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}
        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)
            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv,
                    extra_actions=coinrun_old_extra_actions,
                    default_action=0,
                )
        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)
                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]
        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)

    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)

    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)

    venv = VecShallowCopy(venv)
    return venv
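# Example invocations (a sketch; "coinrun" is a standard Procgen env name,
# the Atari env_id and other values are illustrative):
#
#     venv = create_env(64, env_name="coinrun")
#     venv = create_env(8, env_kind="atari", env_id="pong", frame_stack=4)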
def __init__(self, comm, **kwargs):
    self.comm = comm
    # Fraction of MPI ranks used for training (the rest are test ranks).
    self.train_frac = 1.0 - Config.get_test_frac()
    tf.train.AdamOptimizer.__init__(self, **kwargs)
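# Worked example of the divisor used in compute_gradients above (values are
# illustrative): with 8 MPI ranks and Config.get_test_frac() == 0.25,
# train_frac == 0.75, so the summed gradients are divided by 8 * 0.75 == 6,
# the number of *training* ranks. Test ranks contribute all-zero gradients,
# so this yields the mean over training ranks only.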
def __init__(self, sess):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    clean_tb_dir()

    tb_writer = tf.summary.FileWriter(
        Config.TB_DIR + '/' + Config.RUN_ID + '_' + str(rank), sess.graph)
    total_steps = [0]

    should_log = (rank == 0 or Config.LOG_ALL_MPI)

    if should_log:
        hyperparams = np.array(Config.get_arg_text())
        hyperparams_tensor = tf.constant(hyperparams)
        summary_op = tf.summary.text("hyperparameters info",
                                     hyperparams_tensor)
        summary = sess.run(summary_op)
        tb_writer.add_summary(summary)

    def add_summary(_merged, interval=1):
        if should_log:
            total_steps[0] += 1
            if total_steps[0] % interval == 0:
                tb_writer.add_summary(_merged, total_steps[0])
                tb_writer.flush()

    tuples = []

    def make_scalar_graph(name):
        scalar_ph = tf.placeholder(name='scalar_' + name, dtype=tf.float32)
        scalar_summary = tf.compat.v1.summary.scalar(name, scalar_ph)
        merged = tf.compat.v1.summary.merge([scalar_summary])
        tuples.append((scalar_ph, merged))

    name_dict = {}
    curr_name_idx = [0]

    def log_scalar(x, name, step=-1):
        if name not in name_dict:
            name_dict[name] = curr_name_idx[0]
            tf_name = (name + '_' + Config.RUN_ID) if curr_name_idx[0] == 0 \
                else name
            make_scalar_graph(tf_name)
            curr_name_idx[0] += 1

        idx = name_dict[name]
        scalar_ph, merged = tuples[idx]

        if should_log:
            if step == -1:
                step = total_steps[0]
                total_steps[0] += 1
            _merged = sess.run(merged, {scalar_ph: x})
            tb_writer.add_summary(_merged, step)
            tb_writer.flush()

    self.add_summary = add_summary
    self.log_scalar = log_scalar
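# Usage sketch (assumes this __init__ belongs to the TB_Writer class that
# the training scripts instantiate as `tb_writer = TB_Writer(sess)`):
#
#     tb_writer = TB_Writer(sess)
#     tb_writer.log_scalar(mean_fit, "mean_fit", timesteps_done)  # explicit step
#     tb_writer.log_scalar(loss_val, "loss")                      # auto-incremented step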
def setup(**kwargs):
    Config.merge(kwargs)
    from coinrun.coinrunenv import init_args_and_threads
    init_args_and_threads()
def enjoy_env_sess(sess, checkpoint, overlap):
    # base_name = str(8*checkpoint) + 'M'
    # load_file = setup_utils.restore_file(Config.RESTORE_ID, base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    # args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # The load name is specified by Config.RESTORE_ID; loading returns True/False.
    if checkpoint == 0:
        # No checkpoint yet: log zeros and bail out.
        mean_score = 0.0
        succ_rate = 0.0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': 0
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())  # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)

    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    score_counts = np.array([0] * nenvs)

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """Roll out until rep_count episodes per env are done; return scores."""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1

            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']

        return scores, rews, eplens

    mpi_print(load_file)
    scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    steps_elapsed = checkpoint * 8000000

    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set
            # size is rep_count * nenvs (each episode gets a fresh seed).
            mpi_print('testset size', testset_size)
            mpi_print('succ_rate', succ_rate)
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
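# The success-rate math above in isolation: CoinRun pays a reward of 10.0
# only on level completion, so an episode counts as a success exactly when
# its score is 10.0. A sketch with illustrative numbers:
#
#     scores = np.array([[10.0, 0.0], [10.0, 10.0]])     # (nenvs=2, rep_count=2)
#     testset_size = scores.size                         # 4 episodes
#     succ_rate = np.sum(scores == 10.0) / testset_size  # 0.75
#     mean_score = np.sum(scores) / testset_size         # 7.5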
def main():
    # general setup
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
    args = setup_utils.setup_and_load()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # prepare directory
    sub_dir = utils.file_to_path(Config.get_save_file(base_name="tmp"))
    if os.path.isdir(sub_dir):
        shutil.rmtree(path=sub_dir)
    os.mkdir(sub_dir)

    # hyperparams
    nenvs = Config.NUM_ENVS
    total_timesteps = Config.TIMESTEPS
    population_size = Config.POPULATION_SIZE
    timesteps_per_agent = Config.TIMESTEPS_AGENT
    worker_count = Config.WORKER_COUNT
    passthrough_perc = Config.PASSTHROUGH_PERC
    mutating_perc = Config.MUTATING_PERC

    # create environment
    def make_env():
        env = utils.make_general_env(nenvs, seed=rank)
        env = wrappers.add_final_wrappers(env)
        return env

    # setup session and workers, and therefore tensorflow ops
    graph = tf.get_default_graph()
    sess = tf.Session(graph=graph)
    policy = policies.get_policy()
    workers = [
        Worker(sess, i, nenvs, make_env, policy, sub_dir)
        for i in range(worker_count)
    ]
    tb_writer = TB_Writer(sess)

    def clean_exit():
        for worker in workers:
            Thread.join(worker.thread)
        utils.mpi_print("")
        utils.mpi_print("== total duration",
                        "{:.1f}".format(time.time() - t_first_start),
                        " s ==")
        utils.mpi_print(" exit...")

        # save best performing agent
        population.sort(key=lambda k: k['fit'], reverse=True)
        workers[0].restore_model(name=population[0]["name"])
        workers[0].dump_model()

        # cleanup
        sess.close()
        shutil.rmtree(path=sub_dir)

    # load data from restore point and seed the whole population
    loaded_name = None
    if workers[0].try_load_model():
        loaded_name = str(uuid.uuid1())
        workers[0].save_model(name=loaded_name)

    # initialise population:
    # either all random and no mutations pending,
    # or all from the restore point with all but one to be mutated
    population = [{
        "name": loaded_name or str(uuid.uuid1()),
        "fit": -1,
        "need_mut": loaded_name is not None and i != 0,
        "age": -1,
        "mean_ep_len": -1
    } for i in range(population_size)]

    utils.mpi_print("== population size", population_size, ", t_agent ",
                    timesteps_per_agent, " ==")

    t_first_start = time.time()
    try:
        # main loop
        generation = 0
        timesteps_done = 0
        while timesteps_done < total_timesteps:
            t_generation_start = time.time()
            utils.mpi_print("")
            utils.mpi_print("__ Generation", generation, " __")

            # initialise and evaluate all new agents
            for agent in population:
                # if agent["fit"] < 0:
                if True:  # test: constant reevaluation, to dismiss "lucky runs" -> seems good
                    # pick a worker from the pool and let it work on the agent
                    not_in_work = True
                    while not_in_work:
                        for worker in workers:
                            if worker.can_take_work():
                                worker.work(agent, timesteps_per_agent)
                                not_in_work = False
                                break
                    timesteps_done += timesteps_per_agent * nenvs

            for worker in workers:
                Thread.join(worker.thread)

            # sort by fitness
            population.sort(key=lambda k: k["fit"], reverse=True)

            # print stats
            fitnesses = [agent["fit"] for agent in population]
            ages = [agent["age"] for agent in population]
            ep_lens = [agent["mean_ep_len"] for agent in population]
            utils.mpi_print(*["{:5.3f}".format(f) for f in fitnesses])
            utils.mpi_print(*["{:5}".format(a) for a in ages])
            utils.mpi_print(
                "__ average fit", "{:.1f}".format(np.mean(fitnesses)),
                ", t_done", timesteps_done,
                ", took", "{:.1f}".format(time.time() - t_generation_start),
                "s", ", total",
                "{:.1f}".format(time.time() - t_first_start), "s __")

            # log stats
            tb_writer.log_scalar(np.mean(fitnesses), "mean_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.median(fitnesses), "median_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.max(fitnesses), "max_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.mean(ages), "mean_age", timesteps_done)
            ep_lens_mean = np.nanmean(ep_lens)
            if not np.isnan(ep_lens_mean):
                tb_writer.log_scalar(ep_lens_mean, "mean_ep_lens",
                                     timesteps_done)

            # cleanup to prevent disk clutter
            to_be_removed = set(
                re.sub(r'\..*$', '', f) for f in os.listdir(sub_dir)) - set(
                    [agent["name"] for agent in population])
            for filename in to_be_removed:
                os.remove(sub_dir + "/" + filename + ".index")
                os.remove(sub_dir + "/" + filename + ".data-00000-of-00001")

            # break when time's up
            if not timesteps_done < total_timesteps:
                break

            # mark weak agents for replacement
            cutoff_passthrough = math.floor(population_size * passthrough_perc)
            cutoff_mutating = math.floor(population_size * mutating_perc)
            source_agents = population[:cutoff_mutating]
            new_population = population[:cutoff_passthrough]
            k = 0
            while len(new_population) < population_size:
                new_agent = {
                    # take the name from the source agent, so mutation
                    # knows the parent
                    "name": source_agents[k]["name"],
                    "fit": -1,
                    "need_mut": True,
                    "age": 0,
                    "mean_ep_len": -1
                }
                new_population.append(new_agent)
                k = (k + 1) % len(source_agents)
            population = new_population

            generation += 1

        clean_exit()
    except KeyboardInterrupt:
        clean_exit()

    return 0
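# The truncation-selection step above with illustrative numbers: for
# population_size == 24, passthrough_perc == 0.25 and mutating_perc == 0.5,
# the top floor(24 * 0.25) == 6 agents survive unchanged, and the top
# floor(24 * 0.5) == 12 agents serve as mutation parents, assigned
# round-robin (k = (k + 1) % 12) until the population is refilled to 24.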
def main(sess):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)

    # set file name
    filename = DIR_NAME + "/" + Config.get_save_file() + "_" + \
        str(seed * 100 + rank) + ".npz"

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape,
                       dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]

        # remove noisy inputs by taking one random step first
        actions = np.array([env.action_space.sample() for _ in range(nenv)])
        obs[:], rewards, dones, _ = env.step(actions)

        state = agent.initial_state if use_policy else None
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = \
            [], [], [], [], []

        # for n in range of the number of steps:
        for _ in range(400):
            # Given observations, get action values and neglogpacs.
            # We already have self.obs because the Runner superclass runs
            # self.obs[:] = env.reset() on init.
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = np.array(
                    [env.action_space.sample() for _ in range(nenv)])

            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)

            # Take actions in env and record the results.
            # `info` contains a ton of useful information.
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)

        # np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)

    return filename
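# Reading the extracted dataset back (a sketch; the shape assumes the
# defaults above, i.e. 400 steps of nenv parallel envs with CoinRun's
# 64x64 RGB observations):
#
#     data = np.load(filename)
#     obs = data['obs']    # (400, nenv, 64, 64, 3), same dtype as the env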