def train(env_id, num_timesteps, seed):
    from baselines.rac import mlp_policy, rac_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    test_env = make_gym_control_env(env_id, seed)
    rac_simple.learn(env, test_env, policy_fn,
                     max_timesteps=num_timesteps,
                     timesteps_per_actorbatch=2048,
                     clip_param=0.2, entcoeff=0.0,
                     optim_epochs=10, optim_stepsize=2e-4, optim_batchsize=64,
                     gamma=0.99, lam=0.95,
                     shift=0,
                     schedule='linear')
    env.close()
    test_env.close()

def train(env_id, num_timesteps, seed):
    from baselines.dual_rac import mlp_policy, rac_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    rac_simple.learn(env, policy_fn,
                     max_timesteps=num_timesteps,
                     timesteps_per_actorbatch=2048,
                     clip_param=0.2, entcoeff=0.0,
                     optim_epochs=1, optim_stepsize=4e-4, optim_batchsize=64,
                     gamma=0.99, lam=0.95,
                     rho=0.95,  # gradient weighting factor
                     update_step_threshold=3,  # update step threshold
                     shift=0,
                     schedule='linear')
    env.close()

def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 100000
    alpha = 0.01
    sigma = 0.1
    eval_iters = 1
    from baselines.openai_es import mlp_policy, es_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_gym_control_env(env_id, seed)
    es_simple.learn(base_env, policy_fn,
                    max_fitness=max_fitness,  # must be negative; the optimizer minimizes
                    popsize=popsize,
                    gensize=gensize,
                    sigma=sigma,
                    alpha=alpha,
                    eval_iters=eval_iters,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    seed=seed)
    base_env.close()

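# For orientation, a minimal sketch of the update that a vanilla OpenAI-ES
# learner such as es_simple.learn presumably performs: perturb the parameters
# with Gaussian noise, weight each perturbation by its normalized fitness, and
# step along the weighted average direction. All names below (es_step, fitness)
# are illustrative assumptions, not this repo's API.
import numpy as np

def es_step(theta, fitness, popsize=32, sigma=0.1, alpha=0.01, rng=None):
    rng = rng or np.random.default_rng()
    eps = rng.standard_normal((popsize, theta.size))  # one noise vector per sample
    rewards = np.array([fitness(theta + sigma * e) for e in eps])
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)  # fitness shaping
    grad = eps.T @ rewards / (popsize * sigma)  # ES gradient estimate
    return theta + alpha * grad  # ascend the estimated gradient
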
def main():
    """Train with UberGA on a Gym control task."""
    args = gym_ctrl_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix="UberGA-" + args.env + "_seed_" + str(args.seed))
    logger.log("Algorithm:UberGA-" + args.env + "_seed_" + str(args.seed))
    env_id = args.env
    seed = args.seed
    generation = 0
    with make_session() as sess:
        env = make_gym_control_env(env_id, seed)
        try:
            model = simple_mlp(sess, env)
            sess.run(tf.global_variables_initializer())
            learn_sess = LearningSession(sess, model)
            while True:
                if generation >= 10000 or learn_sess.timesteps_so_far >= 5e6:
                    break
                pop = learn_sess.generation(env, trials=1, population=POPULATION)
                generation += 1
        finally:
            env.close()

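# The learn_sess.generation call above is what advances the GA. As a rough
# sketch of one generation under the usual Uber deep-GA truncation-selection
# scheme (every name here is hypothetical, not this repo's interface): keep
# the top fraction of flat parameter vectors, then refill the population with
# Gaussian mutations of randomly chosen elites.
import numpy as np

def ga_generation(population, fitness, elite_frac=0.1, mut_std=0.02, rng=None):
    rng = rng or np.random.default_rng()
    scores = np.array([fitness(ind) for ind in population])
    n_elite = max(1, int(elite_frac * len(population)))
    elites = [population[i] for i in np.argsort(scores)[::-1][:n_elite]]
    children = [elites[rng.integers(n_elite)]
                + mut_std * rng.standard_normal(elites[0].size)
                for _ in range(len(population) - n_elite)]
    return elites + children
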
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    # if rank == 0:
    #     logger.configure()
    # else:
    #     logger.configure(format_strs=[])
    #     logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    env = make_gym_control_env(env_id, workerseed)  # seed each MPI worker differently
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()

def train(env_id, num_timesteps, seed):
    max_fitness = -10000
    popsize = 32
    gensize = 30  # generation size for each iteration
    bounds = [-5.0, 5.0]
    sigma = 0.1
    eval_iters = 3
    from baselines.cmaes_layer_entire import mlp_policy, cmaes_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_gym_control_env(env_id, seed)
    cmaes_simple.learn(base_env, policy_fn,
                       max_fitness=max_fitness,  # must be negative, as CMA-ES minimizes
                       popsize=popsize,
                       gensize=gensize,
                       bounds=bounds,
                       sigma=sigma,
                       eval_iters=eval_iters,
                       max_timesteps=num_timesteps,
                       timesteps_per_actorbatch=2048,
                       seed=seed)
    base_env.close()

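# The "must be negative" caveat exists because CMA-ES minimizes its objective,
# so episode return is negated before being handed to the optimizer. A minimal
# ask/tell loop with the off-the-shelf cma package illustrates the convention;
# episode_return is a toy stand-in for a policy rollout, not part of this repo.
import cma
import numpy as np

def episode_return(params):
    # Toy stand-in for rolling out a policy parametrized by `params`.
    return -np.sum((params - 1.0) ** 2)

es = cma.CMAEvolutionStrategy(np.zeros(8), 0.1)
while not es.stop():
    candidates = es.ask()                              # sample a population
    losses = [-episode_return(c) for c in candidates]  # negate: CMA-ES minimizes
    es.tell(candidates, losses)                        # update the search distribution
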
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 20  # generation size for each iteration
    bounds = [-5.0, 5.0]
    max_v_train_iter = 10
    sigma = 0.01
    eval_iters = 1
    from baselines.ppo_cmaes_surrogate2_Q import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    test_env = make_gym_control_env(env_id, seed)
    pposgd_simple.learn(env, test_env, policy_fn,
                        max_fitness=max_fitness,  # must be negative, as CMA-ES minimizes
                        popsize=popsize,
                        gensize=gensize,
                        bounds=bounds,
                        sigma=sigma,
                        eval_iters=eval_iters,
                        max_v_train_iter=max_v_train_iter,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear',
                        seed=seed,
                        env_id=env_id)
    env.close()
    test_env.close()

def train(env_id, num_timesteps, seed):
    from baselines.ars import ars
    main_loop_size = 1000
    horizon = 1000
    step_size = 0.03
    noise = 0.03
    hp = ars.Hp(main_loop_size, horizon, num_timesteps, step_size, noise)
    set_global_seeds(seed)
    env = make_gym_control_env(env_id, seed)
    # env = wrappers.Monitor(env, monitor_dir, force=True)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = ars.Policy(num_inputs, num_outputs, hp)
    normalizer = ars.Ob_Normalizer(num_inputs)
    ars.train(env, policy, normalizer, hp)
    env.close()

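# For reference, the core ARS (V1) update that ars.train presumably implements:
# evaluate antithetic (+/-) Gaussian perturbations of a linear policy matrix and
# step along the average reward-weighted direction, scaled by the reward standard
# deviation. A sketch under those assumptions; `rollout` is hypothetical, and the
# top-performing-directions filter of ARS V2 is omitted.
import numpy as np

def ars_step(M, rollout, num_deltas=8, step_size=0.03, noise=0.03, rng=None):
    rng = rng or np.random.default_rng()
    deltas = [rng.standard_normal(M.shape) for _ in range(num_deltas)]
    r_pos = np.array([rollout(M + noise * d) for d in deltas])  # +delta rollouts
    r_neg = np.array([rollout(M - noise * d) for d in deltas])  # -delta rollouts
    sigma_r = np.concatenate([r_pos, r_neg]).std() + 1e-8       # reward scale
    step = sum((rp - rn) * d for rp, rn, d in zip(r_pos, r_neg, deltas))
    return M + step_size / (num_deltas * sigma_r) * step
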
def train(env_id, num_timesteps, seed):
    env = make_gym_control_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        learn(env,
              policy=policy, vf=vf,
              gamma=0.99, lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False)
    env.close()

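# All of the train functions above share the same (env_id, num_timesteps, seed)
# signature, so a thin entry point of the following shape is enough to launch
# any one of them. This is a sketch mirroring the UberGA main() above;
# gym_ctrl_arg_parser and logger come from the surrounding module, and
# args.num_timesteps is assumed to be defined by that parser.
def main():
    args = gym_ctrl_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix=args.env + "_seed_" + str(args.seed))
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()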