def train(env_id, num_timesteps, seed):
    from baselines.rac import mlp_policy, rac_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    test_env = make_gym_control_env(env_id, seed)
    rac_simple.learn(env,
                     test_env,
                     policy_fn,
                     max_timesteps=num_timesteps,
                     timesteps_per_actorbatch=2048,
                     clip_param=0.2,
                     entcoeff=0.0,
                     optim_epochs=10,
                     optim_stepsize=2e-4,
                     optim_batchsize=64,
                     gamma=0.99,
                     lam=0.95,
                     shift=0,
                     schedule='linear')
    env.close()
    test_env.close()
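Each of these examples relies on make_gym_control_env, a project-specific helper that is not part of stock baselines. Below is a minimal sketch of what it is assumed to do (build and seed a classic-control Gym environment); the actual helper may add monitors or wrappers.

import gym
from baselines.common import set_global_seeds


def make_gym_control_env(env_id, seed):
    # Hypothetical stand-in: create the Gym control task and seed it for reproducibility.
    set_global_seeds(seed)
    env = gym.make(env_id)
    env.seed(seed)
    return env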
Example #2
def train(env_id, num_timesteps, seed):
    from baselines.dual_rac import mlp_policy, rac_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    rac_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=1,
        optim_stepsize=4e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        rho=0.95,  # Gradient weighting factor
        update_step_threshold=3,  # Updating step threshold
        shift=0,
        schedule='linear')
    env.close()
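A train function like the ones above is typically driven by a small entry point. The environment id, timestep budget, and seed below are purely illustrative.

if __name__ == "__main__":
    # Illustrative invocation; substitute your own environment id, budget, and seed.
    train("Pendulum-v0", num_timesteps=int(1e6), seed=0)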
Example #3
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 100000
    alpha = 0.01
    sigma = 0.1
    eval_iters = 1
    from baselines.openai_es import mlp_policy, es_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    base_env = make_gym_control_env(env_id, seed)
    es_simple.learn(
        base_env,
        policy_fn,
        max_fitness=max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize=popsize,
        gensize=gensize,
        sigma=sigma,
        alpha=alpha,
        eval_iters=eval_iters,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        seed=seed)
    base_env.close()
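The popsize, sigma, and alpha arguments correspond to the canonical OpenAI evolution-strategies update. The sketch below is not es_simple's implementation, only the update rule those hyperparameters parameterize; fitness_fn is an assumed helper that returns the episode return for a flat parameter vector.

import numpy as np


def es_generation(theta, fitness_fn, popsize=32, sigma=0.1, alpha=0.01, rng=None):
    # One OpenAI-ES generation: perturb, evaluate, and step along the fitness-weighted noise.
    rng = rng or np.random.default_rng()
    eps = rng.standard_normal((popsize, theta.size))                 # Gaussian perturbations
    returns = np.array([fitness_fn(theta + sigma * e) for e in eps])
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)    # normalize returns
    # theta <- theta + alpha / (n * sigma) * sum_i F_i * eps_i
    return theta + alpha / (popsize * sigma) * eps.T.dot(returns)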
Example #4
def main():
    """
    Train on CartPole.
    """

    args = gym_ctrl_arg_parser().parse_args()

    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix="UberGA-" + args.env + "_seed_" + str(args.seed))
    logger.log("Algorithm:UberGA-" + args.env + "_seed_" + str(args.seed))
    env_id = args.env
    seed = args.seed
    generation = 0
    with make_session() as sess:
        env = make_gym_control_env(env_id, seed)
        try:
            model = simple_mlp(sess, env)
            sess.run(tf.global_variables_initializer())
            learn_sess = LearningSession(sess, model)
            while True:
                if generation >= 10000 or learn_sess.timesteps_so_far >= 5e6:
                    break
                pop = learn_sess.generation(env, trials=1, population=POPULATION)
                generation += 1
        finally:
            env.close()
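learn_sess.generation is repo-specific, but the Uber GA it refers to uses simple truncation selection with elitism. Below is a sketch of one such generation over flat parameter vectors; fitness_fn, elite_frac, and mutation_std are illustrative names, not this repo's API.

import numpy as np


def ga_generation(population, fitness_fn, elite_frac=0.25, mutation_std=0.02, rng=None):
    # Truncation selection: keep the top fraction, mutate random elites to refill the population.
    rng = rng or np.random.default_rng()
    scores = np.array([fitness_fn(p) for p in population])
    n_elite = max(1, int(len(population) * elite_frac))
    elite_idx = np.argsort(scores)[::-1][:n_elite]
    elites = [population[i] for i in elite_idx]
    children = [elites[0]]                              # elitism: carry the best over unchanged
    while len(children) < len(population):
        parent = elites[rng.integers(len(elites))]
        children.append(parent + mutation_std * rng.standard_normal(parent.size))
    return children, scores[elite_idx[0]]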
Example #5
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    from mpi4py import MPI
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    # if rank == 0:
    #     logger.configure()
    # else:
    #     logger.configure(format_strs=[])
    #     logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    env = make_gym_control_env(env_id, workerseed)  # use the per-rank seed computed above
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #6
def train(env_id, num_timesteps, seed):
    max_fitness = -10000
    popsize = 32
    gensize = 30 # gen size for each iteration
    bounds = [-5.0, 5.0]
    sigma = 0.1
    eval_iters = 3
    from baselines.cmaes_layer_entire import mlp_policy, cmaes_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_gym_control_env(env_id, seed)
    cmaes_simple.learn(base_env,
                       policy_fn,
                       max_fitness=max_fitness,  # has to be negative, as CMA-ES considers minimization
                       popsize=popsize,
                       gensize=gensize,
                       bounds=bounds,
                       sigma=sigma,
                       eval_iters=eval_iters,
                       max_timesteps=num_timesteps,
                       timesteps_per_actorbatch=2048,
                       seed=seed)
    base_env.close()
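cmaes_simple is repo-specific, but its popsize, sigma, and bounds arguments mirror the standard ask/tell loop of the pycma package. Note the sign flip: CMA-ES minimizes, which is why the max_fitness comment above says the value has to be negative. A sketch, assuming an evaluate_policy(params) helper that returns an episode return:

import cma  # pycma


def cmaes_search(x0, evaluate_policy, sigma=0.1, popsize=32, bounds=(-5.0, 5.0), gens=30):
    # Minimal ask/tell loop over flat policy parameters.
    es = cma.CMAEvolutionStrategy(x0, sigma, {'popsize': popsize, 'bounds': list(bounds)})
    for _ in range(gens):
        candidates = es.ask()                                   # sample one generation
        costs = [-evaluate_policy(c) for c in candidates]       # negate: CMA-ES minimizes
        es.tell(candidates, costs)
    return es.result.xbest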
Example #7
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 20  # gen size for each iteration
    bounds = [-5.0, 5.0]
    max_v_train_iter = 10
    sigma = 0.01
    eval_iters = 1
    from baselines.ppo_cmaes_surrogate2_Q import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_gym_control_env(env_id, seed)
    test_env = make_gym_control_env(env_id, seed)
    pposgd_simple.learn(
        env,
        test_env,
        policy_fn,
        max_fitness=max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize=popsize,
        gensize=gensize,
        bounds=bounds,
        sigma=sigma,
        eval_iters=eval_iters,
        max_v_train_iter=max_v_train_iter,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        seed=seed,
        env_id=env_id)
    env.close()
    test_env.close()
Example #8
def train(env_id, num_timesteps, seed):
    from baselines.ars import ars
    main_loop_size = 1000
    horizon = 1000
    step_size = 0.03
    noise = 0.03
    hp = ars.Hp(main_loop_size, horizon, num_timesteps, step_size, noise)
    set_global_seeds(seed)
    env = make_gym_control_env(env_id, seed)
    # env = wrappers.Monitor(env, monitor_dir, force=True)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = ars.Policy(num_inputs, num_outputs, hp)
    normalizer = ars.Ob_Normalizer(num_inputs)
    ars.train(env, policy, normalizer, hp)
    env.close()
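step_size and noise above correspond to the ARS update of Mania et al.: roll out mirrored perturbations of a linear policy and step along the reward-weighted directions, scaled by the standard deviation of the returns. Below is a simplified sketch of basic ARS (no observation normalization or top-b direction selection); rollout_return is an assumed helper returning an episode return for a weight matrix.

import numpy as np


def ars_update(M, rollout_return, num_directions=16, step_size=0.03, noise=0.03, rng=None):
    # One basic ARS update of a linear policy matrix M.
    rng = rng or np.random.default_rng()
    deltas = [rng.standard_normal(M.shape) for _ in range(num_directions)]
    r_pos = np.array([rollout_return(M + noise * d) for d in deltas])
    r_neg = np.array([rollout_return(M - noise * d) for d in deltas])
    sigma_r = np.concatenate([r_pos, r_neg]).std() + 1e-8       # reward std for scaling
    step = sum((rp - rn) * d for rp, rn, d in zip(r_pos, r_neg, deltas))
    return M + step_size / (num_directions * sigma_r) * step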
Example #9
def train(env_id, num_timesteps, seed):
    import tensorflow as tf
    env = make_gym_control_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False)

        env.close()
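As in Example #4, a train function like this one would normally be wired up through the shared argument parser. A hypothetical entry point, assuming gym_ctrl_arg_parser exposes env, num_timesteps, and seed:

def main():
    args = gym_ctrl_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == "__main__":
    main()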