def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)
    # 'gail' runs on a dedicated '*_gail' environment registry and is backed
    # by the 'bgail' learner.
    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError

    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)
    logger.configure(os.path.join(
        "log", "GAIL", args.env,
        "subsample_{}".format(extra_args["data_subsample_freq"]),
        "traj_{}".format(extra_args["num_expert_trajs"]),
        "batch_size_{}".format(extra_args["timesteps_per_batch"]),
        "seed_{}".format(args.seed)))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env, seed=args.seed, save_path=args.save_path,
                  load_path=args.load_path, render=args.render, **alg_kwargs)
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env, seed=seed, total_timesteps=total_timesteps,
                  sil_update=args.sil_update, sil_loss=args.sil_loss,
                  **alg_kwargs)

    return model, env
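A minimal sketch of how this train() variant might be invoked. The field names mirror what the function body reads, but the concrete values, the SimpleNamespace construction, and any extra fields build_env() may expect are assumptions, not part of the source:

# Hypothetical driver for the train() above; all values are placeholders.
from types import SimpleNamespace

args = SimpleNamespace(
    env='HalfCheetah-v2',     # assumed environment id read by get_env_type
    alg='ppo2',               # assumed learner registered with get_learn_function
    num_timesteps=1e6,
    seed=0,
    network=None,             # falls back to get_default_network(env_type)
    save_video_interval=0,    # 0 disables the VecVideoRecorder wrapper
    save_video_length=200,
    sil_update=4,             # self-imitation-learning knobs this variant forwards
    sil_loss=0.1,
)
model, env = train(args, extra_args={})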
def main():
    args = parse_args()

    format_strs = ['log', 'csv', 'stdout']
    if args.tensorboard:
        format_strs.append('tensorboard')

    config = parse_config(args.config)
    outdir = os.path.join(args.outdir,
                          os.path.splitext(os.path.basename(args.config))[0])
    logger.configure(dir=outdir, format_strs=format_strs)

    env_type, env_id = get_env_type(GAME_ENVIRONMENT)
    env = make_vec_env(env_id, env_type, 1, args.seed)

    model = trpo_mpi.learn(env=env, network=NETWORK_ARCHITECTURE,
                           total_timesteps=args.total_timesteps, **config)
    env.close()

    if args.save:
        model.save(os.path.join(outdir, 'model'))
def build_env(args):
    env_type, env_id = run.get_env_type(args.env)
    if env_type in ['mujoco', 'classic_control']:
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        raise NotImplementedError
    return env
def build_env(env_name, num_env=1, seed=None):
    env_type, env_id = get_env_type(env_name)
    env_ = make_vec_env(env_id, env_type, num_env, seed)
    if env_type == 'mujoco':
        env_ = VecNormalize(env_)
    return env_
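A short usage sketch for this build_env(); the environment id and the step loop are illustrative assumptions:

# Hypothetical usage of build_env(); 'Hopper-v2' is a placeholder id.
import numpy as np

env = build_env('Hopper-v2', num_env=4, seed=42)  # VecNormalize-wrapped for mujoco
obs = env.reset()                                 # shape: (4,) + observation_space.shape
obs, rewards, dones, infos = env.step(
    np.stack([env.action_space.sample() for _ in range(4)]))
env.close()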
def __init__(self, agent):
    self.agent = agent
    env_type, env_id = get_env_type(self.agent.env_name)
    self.nenv = self.agent.nenv
    self.env = make_vec_env(env_id, env_type, self.nenv, self.agent.seed,
                            reward_scale=1)
    self.reset()
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    # NOTE: the learner's defaults additionally set normalize_observations=True;
    # the build_env arguments may need adjusting once the train/eval details
    # are finalized.
    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    iters = 0
    for model in learn(env=env, env_id=env_id, eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed, total_timesteps=total_timesteps,
                       sil_update=args.sil_update, sil_loss=args.sil_loss,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Save {} model'.format(iters + 1))
        iters += 1

    return model, env
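Since this variant pickles (env.ob_rms, env.ret_rms) next to each checkpoint, restoring the statistics for evaluation might look like the sketch below; the checkpoint file name and the way the stats are re-attached to a fresh env are assumptions:

# Hypothetical restore of the running-mean/std stats saved above.
import pickle

eval_env = build_env(args, normalize_ob=False, is_eval=True)
if isinstance(eval_env, VecNormalize):
    with open('rms-0', 'rb') as f:  # placeholder checkpoint name
        eval_env.ob_rms, eval_env.ret_rms = pickle.load(f)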
def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)
    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError

    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)
    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env, seed=args.seed, save_path=args.save_path,
                  load_path=args.load_path, render=args.render, **alg_kwargs)
args_dict = {
    # earlier entries of this dict are truncated in the source
    'num_env': 1,
    'nsteps': 2048,
    'noptepochs': 10,
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True,
    'transfer_weights': False
}
args = SimpleNamespace(**args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders that will store the checkpoints, models and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Build a throwaway policy to estimate the initial entropy, from which
        # beta could be derived (beta = 2 * entropy / nr_episodes); the
        # estimate is only logged and beta is currently pinned to 0.
        policy = build_policy(
            env, alg_kwargs['network'], value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)
        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())
        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        beta = 0
        print("Initial entropy: " + str(entropy) +
              ", episodes: " + str(nr_episodes))
        print("Beta held constant at: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    iters = 0
    for model in learn(env=env, env_id=env_id, eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed, beta=beta, total_timesteps=total_timesteps,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Save {} model'.format(iters + 1))
        iters += 1

    return model, env
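For intuition on the commented-out schedule beta = 2 * entropy / nr_episodes: a diagonal Gaussian policy's entropy has a closed form, so the value can be checked by hand. The sketch below is a standalone illustration, not part of the training code; the action dimension, stddev, and batch size are placeholders:

# Standalone check of the entropy-based beta schedule (illustrative values).
import numpy as np

act_dim, sigma = 6, 1.0            # placeholder action dimension / stddev
# Entropy of a diagonal Gaussian: 0.5 * sum_i log(2*pi*e*sigma_i^2)
entropy = 0.5 * act_dim * np.log(2 * np.pi * np.e * sigma**2)
nr_episodes = 1_000_000 // 2048    # total_timesteps // timesteps_per_batch
beta = 2 * entropy / nr_episodes
print(entropy, nr_episodes, beta)  # ~8.51, 488, ~0.035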
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)
    # MPI per-worker seeding, kept for reference:
    # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    # set_global_seeds(workerseed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False, normalize_ret=False)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env, osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    # timesteps_per_batch values previously tried: 1024, 2048
    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Build a throwaway policy to estimate the initial entropy, from which
        # beta could be derived (beta = 2 * entropy / nr_episodes); the
        # estimate is only logged and beta is currently pinned to 0.
        policy = build_policy(
            env, alg_kwargs['network'], value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)
        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())
        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        beta = 0
        print("Initial entropy: " + str(entropy) +
              ", episodes: " + str(nr_episodes))
        print("Beta held constant at: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env, seed=seed, beta=beta,
                  total_timesteps=total_timesteps,
                  sil_update=args.sil_update, sil_loss=args.sil_loss,
                  **alg_kwargs)

    return model, env
def make_env(env_name, nenv=1, seed=132, debug=True):
    if debug:
        return GymEnvWrapperDebug(env_name)
    env_type, env_id = get_env_type(env_name)
    env = make_vec_env(env_id, env_type, nenv, seed, reward_scale=1)
    return env
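A usage sketch for make_env(); note that debug defaults to True, so the vectorized path must be requested explicitly. The environment id is a placeholder, and GymEnvWrapperDebug is whatever debug wrapper the surrounding project defines:

# Hypothetical calls; 'CartPole-v1' is a placeholder id.
debug_env = make_env('CartPole-v1')                             # single debug-wrapped env
vec_env = make_env('CartPole-v1', nenv=8, seed=0, debug=False)  # vectorized training env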