def train(self, num_frames, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = helpers.get_env_wrapper(render=FLAGS.render)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    self.learn(policy_fn, env, seed, total_timesteps=num_frames,
               lrschedule=lrschedule, nsteps=(1 if FLAGS.render else 5),
               lr=FLAGS.lr)
    env.close()
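# A minimal, self-contained sketch of the make_env/_thunk pattern that the
# train() functions in this file share. SubprocVecEnv expects a list of
# zero-argument callables, one per worker process; closing over `rank` gives
# each worker a distinct seed. The 'CartPole-v1' env and the baselines import
# paths below are illustrative assumptions, not taken from the snippets here.
import gym
from baselines.common import set_global_seeds
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def demo_vec_env(seed=0, num_cpu=4):
    def make_env(rank):
        def _thunk():
            env = gym.make('CartPole-v1')
            env.seed(seed + rank)  # distinct seed per worker
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    obs = env.reset()  # batched: shape (num_cpu,) + observation_space.shape
    env.close()
    return obs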
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):
    def make_env(rank):
        def _thunk():
            if env_id == "TestEnv":
                env = TestEnv(renderer=renderer)
            else:
                env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_env)])
    env.reset()

    # Time random stepping to estimate throughput.
    start = time.time()
    for i in range(num_timesteps):
        action = [env.action_space.sample() for _ in range(num_env)]
        env.step(action)
    stop = time.time()

    duration = stop - start
    fps = num_timesteps / duration if duration else 0
    env.close()
    return num_env, fps
def train(env_id, save_name, num_timesteps, seed, policy, lrschedule,
          sil_update, sil_beta, num_env):
    policy_fn = CnnPolicy_grid
    # env_args = {'episode_life': False, 'clip_rewards': False}

    # Probe the environment once to read the observation and action spaces.
    env = gym.make(env_id)
    obs = env.reset()
    ob_space = obs["image"].shape
    ac_space = env.action_space
    env.close()
    print('ob_space:', ob_space)
    print('num act:', ac_space)

    envs = [make_env(env_id, seed, i) for i in range(num_env)]
    if num_env > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    learn(policy_fn, envs, seed, ob_space, ac_space, save_name=save_name,
          nsteps=5, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, lr=7e-4)
    envs.close()
def show(env_id, num_timesteps, seed, policy):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            # env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            # gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(1)], render=True)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'autoencoder':
        policy_fn = AutoencoderPolicy

    if policy == 'autoencoder':
        auto_a2c.enjoy(policy_fn, policy, env, seed, num_timesteps)
    else:
        a2c.enjoy(policy_fn, policy, env, seed, num_timesteps)
    env.close()
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    # Divide by 4 due to frameskip, then add a little extra so episodes end.
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'autoencoder':
        policy_fn = AutoencoderPolicy

    if policy == 'autoencoder':
        auto_a2c.learn(policy_fn, policy, env, seed,
                       total_timesteps=int(num_timesteps * 1.1),
                       lrschedule=lrschedule)
    else:
        a2c.learn(policy_fn, policy, env, seed,
                  total_timesteps=int(num_timesteps * 1.1),
                  lrschedule=lrschedule)
    env.close()
def train(env_id, num_frames, seed, load_path, num_cpu):
    num_timesteps = int(num_frames // 4)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          load_path=load_path, nprocs=num_cpu)
    env.close()
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env = gym.wrappers.Monitor(
                env, directory='/home/vasu/Desktop/acktr_json', force=True,
                video_callable=False, write_upon_reset=True)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu, perform,
          use_expert, save_networks, learn_time, expert_buffer_size):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return

    network_saving_dir = os.path.join('./saved_networks', env_id) + '/'
    if not os.path.exists(network_saving_dir):
        os.makedirs(network_saving_dir)

    learn(policy_fn, env, seed, env_id, learn_time, expert_buffer_size,
          perform, use_expert, save_networks, network_saving_dir,
          int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'radlstm':
        policy_fn = RadLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def env_fn():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=0)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return env_fn

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return
    learn(policy_fn, env, seed, nsteps=50,
          total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule,
          buffer_size=15000, gamma=0.95)
    env.close()
def train(num_timesteps, seed, num_cpu):
    # TODO: this single-process special case is ugly; handle it better
    def make_env(rank):
        def _thunk():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=11)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = OurAcktrPolicy
    learn(policy_fn, env, seed, nsteps=4,
          total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu,
          save_interval=10)
    env.close()
def train(make_env, num_timesteps, seed, policy, lrschedule, num_cpu,
          vf_coef=0.5, ent_coef=0.01):
    def _make_env(rank):
        def _thunk():
            env = make_env()
            env.seed(seed + rank)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([_make_env(i) for i in range(num_cpu)])
    learn(policy, env, seed, nstack=1, total_timesteps=num_timesteps,
          lrschedule=lrschedule, vf_coef=vf_coef, ent_coef=ent_coef)
    env.close()
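# A hypothetical invocation of the generic train() above. `policy` is passed
# straight through to the learn() in scope, so the right value depends on that
# function; the 'mlp' string, the env factory, and all hyperparameter values
# here are placeholders, not taken from this file.
if __name__ == '__main__':
    import gym
    train(make_env=lambda: gym.make('CartPole-v1'),
          num_timesteps=int(1e5), seed=0, policy='mlp',
          lrschedule='constant', num_cpu=4)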
def train():
    env_args = dict(
        map_name=FLAGS.map_name,
        step_mul=FLAGS.step_mul,
        game_steps_per_episode=0,
        screen_size_px=(FLAGS.resolution,) * 2,
        minimap_size_px=(FLAGS.resolution,) * 2,
        visualize=FLAGS.visualize)
    envs = SubprocVecEnv(
        [partial(make_sc2env, id=i, **env_args) for i in range(FLAGS.n_envs)])
    policy_fn = FullyConvPolicy
    try:
        learn(policy_fn,
              envs,
              seed=1,
              total_timesteps=int(1e6) * FLAGS.frames,
              lrschedule=FLAGS.lrschedule,
              nstack=1,  # must be 1 for FullyConvPolicy above
              ent_coef=FLAGS.entropy_weight,
              vf_coef=FLAGS.value_weight,
              max_grad_norm=1.0,
              lr=FLAGS.learning_rate)
    except KeyboardInterrupt:
        pass
    envs.close()
def train(env_id, num_frames, seed, num_cpu, save_interval, ckpt_dir):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            # Check that the full 18-action Atari action space is used.
            assert env.action_space.n == 18, \
                "action space has {} actions, expected the full set of 18".format(
                    env.action_space.n)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu,
          save_interval=save_interval, ckpt_dir=ckpt_dir)
    env.close()
def train(env_id, num_frames, seed, nsteps, policy, lrschedule, num_cpu,
          model_path, lr=7e-4, pg_coef=1.0, ent_coef=0.01, vf_coef=0.5):
    num_timesteps = int(num_frames / 4)  # divide by 4 due to frameskip

    def make_env(rank, isTraining=True):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)),
                allow_early_resets=(not isTraining))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env, episode_life=isTraining,
                                 clip_rewards=isTraining)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i, isTraining=True) for i in range(num_cpu)])
    eval_env = SubprocVecEnv(
        [make_env(num_cpu + i, isTraining=False) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, eval_env, seed, nsteps=nsteps,
          total_timesteps=num_timesteps, lr=lr, pg_coef=pg_coef,
          ent_coef=ent_coef, vf_coef=vf_coef, lrschedule=lrschedule,
          model_path=model_path)
    eval_env.close()
    env.close()
def train(env_id, policy_fn, num_timesteps, seed, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir,
          epoch, env_name, save_dir, use_linear_clip_decay):
    records = []
    envs = [make_env(rank=i) for i in range(num_processes)]
    replaybuffer = Buffer()
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    try:
        state_shape = envs.observation_space.shape[0]
        action_shape = envs.action_space.shape[0]
        model = model_dict[model_name](state_shape, action_shape)
        compute_loss = loss_dict[model_name]
        optimizer = torch.optim.Adam(model.parameters())
        state = envs.reset()
        returns = 0
        for t in range(num_env_steps // num_processes):
            action, log_prob = model.act(state)
            next_state, reward, done, info = envs.step(to_np(action))
            returns += reward
            replaybuffer.store(zip(state, to_np(action), to_np(log_prob),
                                   reward, next_state, 1 - done))
            for i, d in enumerate(done):
                if d:
                    records.append((t * num_processes + i, returns[i]))
                    if i == 0:
                        print(returns[0])
                    returns[i] = 0
            state = next_state
            # Update every ~500 env steps. The parentheses matter: without
            # them, `t % 500 // num_processes` parses as
            # `(t % 500) // num_processes`.
            if t % (500 // num_processes) == (500 // num_processes - 1):
                for _ in range(epoch):
                    optimizer.zero_grad()
                    loss = compute_loss(replaybuffer.sample(), model)
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                if model_name == 'PPO' or model_name == 'DPPO':
                    replaybuffer.clear()
            if t % (num_env_steps // num_processes // 10) == 0:
                i = t // (num_env_steps // num_processes // 10)
                torch.save(model.state_dict(),
                           os.path.join(save_dir, model_name, env_name,
                                        model_name + str(i) + '.pt'))
            if use_linear_clip_decay:
                update_linear_schedule(optimizer, t * num_processes)
        torch.save(model.state_dict(),
                   os.path.join(save_dir, model_name, env_name,
                                model_name + '_Final.pt'))
        timesteps, sumofrewards = zip(*records)
        savemat(os.path.join(save_dir, model_name, env_name, 'returns.mat'),
                {'timesteps': timesteps, 'returns': sumofrewards})
    except Exception:
        traceback.print_exc()
    finally:
        envs.close()
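# update_linear_schedule() is called above but not defined in this snippet.
# A common implementation (an assumption, not taken from this file; the
# default total_steps and initial_lr are placeholders) decays the optimizer's
# learning rate linearly toward zero over training:
def update_linear_schedule_sketch(optimizer, step, total_steps=1000000,
                                  initial_lr=7e-4):
    lr = initial_lr * (1 - step / float(total_steps))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr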
def train_acktr(env_id, num_timesteps, seed, num_cpu):
    """Train an ACKTR model.

    Parameters
    ----------
    env_id: str
        environment to train on
    num_timesteps: int
        number of env steps to optimize for
    seed: int
        random seed
    num_cpu: int
        number of parallel agents
    """
    num_timesteps //= 4

    def make_env(rank):
        def _thunk():
            # 1. Create gym environment
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            # 2. Apply action space wrapper
            env = MarioActionSpaceWrapper(env)
            # 3. Apply observation space wrapper to reduce input size
            env = ProcessFrame84(env)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    acktr_disc.learn(policy_fn, env, seed, total_timesteps=num_timesteps,
                     nprocs=num_cpu, save_interval=True, lr=FLAGS.lr,
                     callback=acktr_callback)
    env.close()
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])
    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though the crash
    # happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
def train():
    # Fetch the requested environment set in flags.
    env_class = attrgetter(FLAGS.env)(sc2g.env)
    env_args = dict(
        map_name=FLAGS.map_name,
        feature_screen_size=FLAGS.screen_size,
        feature_minimap_size=FLAGS.minimap_size,
        visualize=FLAGS.visualize,
        save_replay_episodes=FLAGS.save_replay_episodes,
        replay_dir=FLAGS.replay_dir,
    )
    envs = SubprocVecEnv([
        partial(env_class.make_env, id=i, **env_args)
        for i in range(FLAGS.envs)
    ])

    policy_fn = CnnPolicy
    if FLAGS.policy == 'cnn':
        policy_fn = CnnPolicy
    elif FLAGS.policy == 'lstm':
        policy_fn = LstmPolicy
    elif FLAGS.policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif FLAGS.policy == 'fullyconv':
        policy_fn = FullyConvPolicy
    else:
        print("Invalid policy function! Defaulting to {}.".format(policy_fn))

    try:
        learn(policy_fn,
              envs,
              seed=1,
              total_timesteps=int(1e6 * FLAGS.max_timesteps),
              lrschedule=FLAGS.lrschedule,
              ent_coef=FLAGS.entropy_weight,
              vf_coef=FLAGS.value_weight,
              max_grad_norm=1.0,
              lr=FLAGS.learning_rate)
    except KeyboardInterrupt:
        pass

    print("Closing environment...")
    envs.close()
class EnvironmentContext:
    def __init__(self, *, env_name=None, make_env=None, seed, n_envs=1,
                 env_modifiers=list(), vec_env_modifiers=list()):
        self.env_name = env_name
        if make_env is None:
            make_env = lambda: gym.make(self.env_name)
        self.make_env = make_env
        self.n_envs = n_envs
        self.env_modifiers = env_modifiers
        self.vec_env_modifiers = vec_env_modifiers
        self.seed = seed

    def __enter__(self):
        def make_env(i):
            def _thunk():
                env = self.make_env()
                env.seed(i)
                for fn in self.env_modifiers:
                    env = fn(env)
                env = bench.Monitor(env, logger.get_dir(),
                                    allow_early_resets=True)
                return env
            return _thunk

        set_global_seeds(self.seed)
        self.base_vec_env = SubprocVecEnv(
            [make_env(i + self.seed) for i in range(self.n_envs)])
        self.environments = self.base_vec_env
        for fn in self.vec_env_modifiers:
            self.environments = fn(self.environments)
        return self

    def __exit__(self, *args):
        self.base_vec_env.close()
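# A minimal usage sketch for EnvironmentContext above. 'CartPole-v1' and the
# seed/n_envs values are illustrative placeholders; the fully wrapped
# vectorized env is exposed as ctx.environments.
with EnvironmentContext(env_name='CartPole-v1', seed=0, n_envs=4) as ctx:
    obs = ctx.environments.reset()
    actions = [ctx.environments.action_space.sample() for _ in range(4)]
    obs, rews, dones, infos = ctx.environments.step(actions)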
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage(SkipWrapper(4)(ToDiscrete("minimal")(env)))
            if logger.get_dir():
                env = bench.Monitor(
                    env, os.path.join(logger.get_dir(),
                                      "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu,
          nstack=1)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu, save_interval=None,
          animate_interval=None):
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            # env = NoopResetEnv(env, noop_max=30)
            # env = MaxAndSkipEnv(env, skip=4)
            # if 'FIRE' in env.unwrapped.get_action_meanings():
            #     env = FireResetEnv(env)
            # env = WarpFrame(env)
            env = ClipRewardEnv(env)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = FcnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, save_interval=save_interval,
          animate_interval=animate_interval, env_id=env_id)
    env.close()
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])
    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though the crash
    # happens at env.reset().
    learn(network=network, env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(num_timesteps, seed, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = LearnToRunEnv(difficulty=(seed + rank) % 3)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = MlpPolicy
    learn(policy_fn, env, seed, nsteps=5, nstack=2,
          total_timesteps=num_timesteps, gamma=0.995, vf_coef=0.5,
          ent_coef=0.0001, max_grad_norm=args.max_grad_norm, lr=args.lr,
          lrschedule='linear')  # 'constant' does much worse
    env.close()
def main():
    """Example program using SubprocVecEnv"""
    num_envs = 2
    env_name = 'BreakoutNoFrameskip-v4'
    # Bind `seed` as a default argument: a bare `lambda: ...` in the list
    # comprehension would close over the loop variable, so every worker
    # would end up with the final seed value.
    env = SubprocVecEnv([
        lambda seed=seed: env_instantiate_fn(env_name, seed)
        for seed in range(num_envs)
    ])
    obs = env.reset()
    print("After reset:")
    print(obs.shape)

    obs, rews, dones, infos = env.step([0, 0])
    print("After first action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([1, 0])
    print("After second action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([0, 1])
    print("After third action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    env.close()
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    # Divide by 4 due to frameskip, then add a little extra so episodes end.
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          lrschedule=lrschedule)
    env.close()
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir):
    records = []
    epoch = 0
    envs = [make_env(rank=i) for i in range(num_processes)]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    try:
        state_shape = envs.observation_space.shape[0]
        action_shape = envs.action_space.shape[0]
        model = model_dict[model_name](state_shape, action_shape)
        state_dict = torch.load(
            os.path.join(save_dir, model_name, env_name, model_name + '_Final.pt'))
        model.load_state_dict(state_dict)

        state = envs.reset()
        returns = 0
        for t in range(num_env_steps // num_processes):
            action, log_prob = model.act(state)
            next_state, reward, done, info = envs.step(to_np(action))
            returns += reward
            for i, d in enumerate(done):
                if d:
                    records.append(returns[i])
                    returns[i] = 0
                    epoch += 1
            # Stop once 100 episodes have finished.
            if epoch >= 100:
                break
            state = next_state

        records = np.array(records)
        print("# of epochs: {0}".format(epoch))
        print("mean: {0}".format(np.mean(records)))
        print("std: {0}".format(np.std(records)))
        print("max: {0}".format(np.max(records)))
        print("min: {0}".format(np.min(records)))
        print("median: {0}".format(np.median(records)))
    except Exception:
        traceback.print_exc()
    finally:
        envs.close()
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    # Divide by 4 due to frameskip, then add a little extra so episodes end.
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage(SkipWrapper(4)(ToDiscrete("minimal")(env)))
            env = bench.Monitor(
                env, logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          lrschedule=lrschedule, lr=1e-4, nsteps=10, nstack=1)
    env.close()