def train(args, extra_args):
    """Build the training environment and run the selected learn function.

    Args:
        args: Parsed run options. This function reads ``alg``,
            ``num_timesteps``, ``seed``, ``save_video_interval``,
            ``save_video_length``, ``network``, ``sil_update`` and
            ``sil_loss`` from it and also forwards it to ``get_env_type``
            and ``build_env``.
        extra_args: Extra learn arguments; merged over the algorithm's
            defaults via ``dict.update``.

    Returns:
        A ``(model, env)`` tuple: the value returned by the learn function
        and the (possibly video-wrapped) environment it was trained on.
    """
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    num_steps = int(args.num_timesteps)
    run_seed = args.seed
    set_global_seeds(run_seed)

    learn_fn = get_learn_function(args.alg)
    learn_kwargs = get_learn_function_defaults(args.alg, env_type)
    learn_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)

    if args.save_video_interval != 0:
        def _is_recording_step(step):
            # Start a recording every `save_video_interval` steps.
            return step % args.save_video_interval == 0

        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=_is_recording_step,
            video_length=args.save_video_length,
        )

    # An explicit args.network wins; otherwise keep whatever the merged
    # kwargs specify, falling back to the env-type default.
    if args.network:
        learn_kwargs['network'] = args.network
    elif learn_kwargs.get('network') is None:
        learn_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, learn_kwargs))

    model = learn_fn(
        env=env,
        seed=run_seed,
        total_timesteps=num_steps,
        sil_update=args.sil_update,
        sil_loss=args.sil_loss,
        **learn_kwargs
    )
    return model, env
def train(args, extra_args):
    """Train with a generator-style learn function, optionally checkpointing.

    The learn function is iterated; every item it yields is treated as the
    current model. When ``args.store_ckpt`` is truthy, each yielded model is
    saved under the logger directory as ``model-<i>`` (``i`` starts at 0),
    and, if the training env is a ``VecNormalize``, its ``(ob_rms, ret_rms)``
    pair is pickled next to it as ``rms-<i>``.

    Args:
        args: Parsed run options. This function reads ``alg``,
            ``num_timesteps``, ``seed``, ``save_video_interval``,
            ``save_video_length``, ``network``, ``sil_update``,
            ``sil_loss`` and ``store_ckpt`` from it and forwards it to
            ``get_env_type`` and ``build_env``.
        extra_args: Extra learn arguments; merged over the algorithm's
            defaults via ``dict.update``.

    Returns:
        A ``(model, env)`` tuple holding the last model yielded by the learn
        function and the training environment. ``model`` is ``None`` when
        the learn function yields nothing.
    """
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    # NOTE: per the original author's note, the defaults additionally carry
    # a normalize_observations=True setting, while both envs here are built
    # with normalize_ob=False.
    # TODO: revisit the build_env arguments once the train/eval details are
    # settled.
    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    # An explicit args.network wins; otherwise keep whatever the merged
    # kwargs specify, falling back to the env-type default.
    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    models = learn(
        env=env,
        env_id=env_id,
        eval_env=eval_env,
        make_eval_env=lambda: build_env(
            args, normalize_ob=False, is_eval=True),
        seed=seed,
        total_timesteps=total_timesteps,
        sil_update=args.sil_update,
        sil_loss=args.sil_loss,
        **alg_kwargs)

    # Pre-bind so the final return cannot hit an unbound name when the
    # generator yields nothing.
    model = None
    for iters, model in enumerate(models):
        if not args.store_ckpt:
            continue

        save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
        model.save(save_path)

        # NOTE(review): if env was wrapped by VecVideoRecorder above, this
        # check is made against the wrapper — confirm that is intended.
        if isinstance(env, VecNormalize):
            rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
            with open(rms_path, 'wb') as f:
                pickle.dump((env.ob_rms, env.ret_rms), f)

        # File names are 0-based; the log message counts from 1.
        logger.log('Save {} model'.format(iters + 1))

    return model, env
def train(args, extra_args):
    """Train with a generator-style learn function and a fixed ``beta`` of 0.

    Before training, a temporary policy is built under the ``tmp_pi``
    variable scope and its entropy on an all-zero observation is printed for
    information. The entropy-based schedule (``2 * entropy / nr_episodes``)
    is disabled, so ``beta`` is always passed to the learn function as 0.

    The learn function is iterated; every item it yields is treated as the
    current model. When ``args.store_ckpt`` is truthy, each yielded model is
    saved under the logger directory as ``model-<i>`` (``i`` starts at 0),
    and, if the training env is a ``VecNormalize``, its ``(ob_rms, ret_rms)``
    pair is pickled next to it as ``rms-<i>``.

    Args:
        args: Parsed run options. This function reads ``alg``,
            ``num_timesteps``, ``seed``, ``save_video_interval``,
            ``save_video_length``, ``network`` and ``store_ckpt`` from it
            and forwards it to ``get_env_type`` and ``build_env``.
        extra_args: Extra learn arguments; merged over the algorithm's
            defaults via ``dict.update``. The merged arguments must contain
            ``timesteps_per_batch`` and ``normalize_observations``.

    Returns:
        A ``(model, env)`` tuple holding the last model yielded by the learn
        function and the training environment. ``model`` is ``None`` when
        the learn function yields nothing.

    Raises:
        KeyError: If the merged learn arguments lack ``timesteps_per_batch``
            or ``normalize_observations``.
    """
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)

    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    # An explicit args.network wins; otherwise keep whatever the merged
    # kwargs specify, falling back to the env-type default.
    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    # Probe the entropy of a freshly initialised policy. The original code
    # guarded this with an always-true `beta < 0` check, so it runs
    # unconditionally.
    nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
    policy = build_policy(
        env,
        alg_kwargs['network'],
        value_network='copy',
        normalize_observations=alg_kwargs['normalize_observations'],
        copos=True)
    ob = observation_placeholder(env.observation_space)

    # NOTE(review): the session is entered but never exited in this
    # function — presumably it is meant to stay active for learn(); confirm.
    sess = U.single_threaded_session()
    sess.__enter__()
    with tf.variable_scope("tmp_pi"):
        tmp_pi = policy(observ_placeholder=ob)
    sess.run(tf.global_variables_initializer())

    tmp_ob = np.zeros((1, ) + env.observation_space.shape)
    entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})

    # The automatic schedule is switched off; beta is held constant.
    beta = 0
    print("Initial entropy: " + str(entropy) + ", episodes: " +
          str(nr_episodes))
    print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    models = learn(
        env=env,
        env_id=env_id,
        eval_env=eval_env,
        make_eval_env=lambda: build_env(
            args, normalize_ob=False, is_eval=True),
        seed=seed,
        beta=beta,
        total_timesteps=total_timesteps,
        **alg_kwargs)

    # Pre-bind so the final return cannot hit an unbound name when the
    # generator yields nothing.
    model = None
    for iters, model in enumerate(models):
        if not args.store_ckpt:
            continue

        save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
        model.save(save_path)

        # NOTE(review): if env was wrapped by VecVideoRecorder above, this
        # check is made against the wrapper — confirm that is intended.
        if isinstance(env, VecNormalize):
            rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
            with open(rms_path, 'wb') as f:
                pickle.dump((env.ob_rms, env.ret_rms), f)

        # File names are 0-based; the log message counts from 1.
        logger.log('Save {} model'.format(iters + 1))

    return model, env
def train(args, extra_args):
    """Run the selected learn function with a fixed ``beta`` of 0.

    Before training, a temporary policy is built under the ``tmp_pi``
    variable scope and its entropy on an all-zero observation is printed for
    information. The entropy-based schedule (``2 * entropy / nr_episodes``)
    is disabled, so ``beta`` is always passed to the learn function as 0.

    Args:
        args: Parsed run options. This function reads ``alg``,
            ``num_timesteps``, ``seed``, ``save_video_interval``,
            ``save_video_length``, ``network``, ``sil_update`` and
            ``sil_loss`` from it and forwards it to ``get_env_type`` and
            ``build_env``.
        extra_args: Extra learn arguments; merged over the algorithm's
            defaults via ``dict.update``. The merged arguments must contain
            ``timesteps_per_batch`` and ``normalize_observations``.

    Returns:
        A ``(model, env)`` tuple: the value returned by the learn function
        and the (possibly video-wrapped) environment it was trained on.
    """
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    num_steps = int(args.num_timesteps)
    run_seed = args.seed
    set_global_seeds(run_seed)

    learn_fn = get_learn_function(args.alg)
    learn_kwargs = get_learn_function_defaults(args.alg, env_type)
    learn_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False, normalize_ret=False)

    if args.save_video_interval != 0:
        def _is_recording_step(step):
            # Start a recording every `save_video_interval` steps.
            return step % args.save_video_interval == 0

        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=_is_recording_step,
            video_length=args.save_video_length,
        )

    # An explicit args.network wins; otherwise keep whatever the merged
    # kwargs specify, falling back to the env-type default.
    if args.network:
        learn_kwargs['network'] = args.network
    elif learn_kwargs.get('network') is None:
        learn_kwargs['network'] = get_default_network(env_type)

    # Probe the entropy of a freshly initialised policy. The original code
    # guarded this with an always-true `beta < 0` check, so it runs
    # unconditionally.
    nr_episodes = num_steps // learn_kwargs['timesteps_per_batch']
    make_policy = build_policy(
        env,
        learn_kwargs['network'],
        value_network='copy',
        normalize_observations=learn_kwargs['normalize_observations'],
        copos=True,
    )
    ob_ph = observation_placeholder(env.observation_space)

    # NOTE(review): the session is entered but never exited in this
    # function — presumably it is meant to stay active for learn(); confirm.
    sess = U.single_threaded_session()
    sess.__enter__()
    with tf.variable_scope("tmp_pi"):
        probe_pi = make_policy(observ_placeholder=ob_ph)
    sess.run(tf.global_variables_initializer())

    zero_ob = np.zeros((1, ) + env.observation_space.shape)
    entropy = sess.run(probe_pi.pd.entropy(), feed_dict={probe_pi.X: zero_ob})

    # The automatic schedule is switched off; beta is held constant.
    beta = 0
    print("Initial entropy: " + str(entropy) + ", episodes: " +
          str(nr_episodes))
    print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, learn_kwargs))

    model = learn_fn(
        env=env,
        seed=run_seed,
        beta=beta,
        total_timesteps=num_steps,
        sil_update=args.sil_update,
        sil_loss=args.sil_loss,
        **learn_kwargs
    )
    return model, env