# Shared imports assumed by the train() variants in this file (OpenAI
# Baselines-style ACKTR code; exact module paths vary across Baselines
# versions and forks, so treat these as a best-effort reconstruction).
# Project-specific helpers (MarioEnv, OurAcktrPolicy, PreprocessImage,
# SkipWrapper, ToDiscrete, ScaleRewardEnv, make_distributed_env, cnst)
# are assumed to be defined elsewhere in their respective projects.
import logging
import os
from functools import partial

import gym
import numpy as np
from baselines import bench, logger
from baselines.common import set_global_seeds
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.acktr.acktr_disc import learn
from baselines.acktr.policies import CnnPolicy


def train(env_id, num_frames, seed, num_cpu):
    # one agent step covers 4 frames (frameskip), plus 10% slack
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env = gym.wrappers.Monitor(
                env,
                directory='/home/vasu/Desktop/acktr_json',
                force=True,
                video_callable=False,
                write_upon_reset=True)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
def train(env_id, num_frames, seed, num_cpu, save_interval, ckpt_dir):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            # ensure the environment exposes the full 18-action Atari space
            assert env.action_space.n == 18, \
                "action space has {} actions, not the full 18".format(
                    env.action_space.n)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          nprocs=num_cpu, save_interval=save_interval, ckpt_dir=ckpt_dir)
    env.close()
def train(env_id, num_frames, seed, load_path, num_cpu):
    num_timesteps = int(num_frames // 4)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          load_path=load_path, nprocs=num_cpu)
    env.close()
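# Hypothetical resume-from-checkpoint call for the variant above; the env id,
# frame budget, and checkpoint path are illustrative placeholders, not values
# from the original code.
train(env_id='BreakoutNoFrameskip-v4',
      num_frames=int(40e6),
      seed=0,
      load_path='./checkpoints/acktr_model.ckpt',  # hypothetical path
      num_cpu=16)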
def train(num_timesteps, seed, num_cpu):
    # TODO: the num_cpu == 1 special case below is ugly; handle it more cleanly
    def make_env(rank):
        def _thunk():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=11)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = OurAcktrPolicy
    learn(policy_fn, env, seed, nsteps=4,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, save_interval=10)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu):
    """
    Train an ACKTR model on Atari.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of CPUs to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
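# Hypothetical invocation of the documented variant above. num_cpu=32 and a
# ~40M-timestep budget mirror common Baselines ACKTR defaults; the env id
# is illustrative.
if __name__ == '__main__':
    train(env_id='BreakoutNoFrameskip-v4',
          num_timesteps=int(40e6),
          seed=0,
          num_cpu=32)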
def train(env_id, policy_fn, num_timesteps, seed, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu, num_env):
    # alternative environment constructors kept for reference:
    env = VecFrameStack(
        # make_atari_env(env_id, num_cpu, seed),
        make_distributed_env(env_id, num_env, seed),
        # make_old_dist_env(env_id, num_env, seed),
        4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu)
    env.close()
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            # strip the 'ppaquette/' prefix from the env id before
            # instantiating the Doom environment
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage(SkipWrapper(4)(ToDiscrete("minimal")(env)))
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps,
          nprocs=num_cpu, nstack=1)
    env.close()
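# ScaleRewardEnv is referenced above but not defined in this file. A minimal
# sketch of what such a reward-scaling wrapper might look like, assuming it
# simply multiplies rewards by a fixed constant (the 0.01 factor is a guess,
# not the original value):
class ScaleRewardEnv(gym.RewardWrapper):
    def __init__(self, env, scale=0.01):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        # shrink raw Doom rewards toward a range the learner handles well
        return reward * self.scale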
def train(params):
    policy_fn = CnnPolicy

    dataflow_config = {
        'future_rewards': True,           # return future discounted rewards
        'exclude_zero_actions': False,    # exclude zero (no-op) actions
        'remap_actions': False,           # remap to a smaller action set
        'clip_rewards': True,             # clip rewards to [-1, 1]
        'monte-specific-blackout': True,  # cover up score and lives indicators
        'pong-specific-blackout': False,  # cover up scores in Pong
        'gamma': params.gamma,            # reward discount factor
        'frame_history': 4,               # minimum number of expert frames since the start of an episode
        'frameskip': 4,                   # frameskip
        'preload_images': True,           # preload images from disk instead of reloading each time
        'gdrive_data_id': cnst.MONTE_DATA_GDRIVE_ID,
        'data_dir': cnst.DATA_DIR,
        'img_dir': cnst.MIKE_IMG_DIR,
        'traj_dir': cnst.MIKE_TRAJECTORIES_DIR,
        'stat_dir': cnst.MIKE_STATES_DIR,
        'batch_size': params.expert_nbatch,
        'max_score_cutoff': params.exp_max_score,  # maximum expert score shown; used to cut expert data
        'min_score_cutoff': 20000,        # minimum score for a trajectory to count as expert
        'process_lost_lifes': True,       # zero the future discounted reward when a life is lost
        'use_n_trajectories': params.use_n_trajectories if 'use_n_trajectories' in params else None,
    }

    the_seed = np.random.randint(10000)
    print(80 * "SEED")
    print("Today's lucky seed is {}".format(the_seed))
    print(80 * "SEED")

    env = VecFrameStack(
        make_atari_env(
            env_id=params.env,
            num_env=params.num_env,
            seed=the_seed,
            limit_len=params.limit_len,
            limit_penalty=params.limit_penalty,
            death_penalty=params.death_penalty,
            step_penalty=params.step_penalty,
            random_state_reset=params.random_state_reset,
            dataflow_config=dataflow_config,
        ),
        params.frame_stack,
    )

    learn(
        policy=policy_fn,
        env=env,
        seed=the_seed,
        params=params,
        dataflow_config=dataflow_config,
        expert_nbatch=params.expert_nbatch,
        exp_adv_est=params.exp_adv_est,
        load_model=params.load_model,
        gamma=params.gamma,
        nprocs=params.num_env,
        nsteps=params.nsteps,
        ent_coef=params.ent_coef,
        expert_coeff=params.exp_coeff,
        lr=params.lr,
        lrschedule=params.lrschedule,
    )
    env.close()
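# Hypothetical construction of the `params` object consumed above. An
# argparse.Namespace works because it supports the `'use_n_trajectories' in
# params` membership test used in the function; every field value here is
# illustrative only, not the authors' settings.
from argparse import Namespace

params = Namespace(
    env='MontezumaRevengeNoFrameskip-v4', num_env=16, frame_stack=4,
    gamma=0.99, nsteps=20, lr=0.25, lrschedule='linear', ent_coef=0.01,
    expert_nbatch=32, exp_max_score=25000, exp_adv_est='critic',
    exp_coeff=1.0, limit_len=None, limit_penalty=0.0, death_penalty=0.0,
    step_penalty=0.0, random_state_reset=False, load_model=None,
)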