Example #1
def run_task(*_):
    # CSV files for logging experiment output (paths are machine-specific).
    f = open('/home/qingkai/verina.csv', "w+")
    ff = open('/home/qingkai/cpo_dual.csv', "w+")

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Ant agent in the Gather environment: apples give reward, bombs incur a cost.
    env = AntGatherEnv(apple_reward=10,
                       bomb_cost=1,
                       n_apples=2,
                       activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    # Value-function baseline fitted with a conjugate-gradient optimizer.
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    # Separate baseline for the safety-cost returns.
    safety_baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        },
        target_key='safety_returns',
    )

    # Safety constraint with threshold 0.2, using its own baseline.
    safety_constraint = GatherSafetyConstraint(max_value=0.2, baseline=safety_baseline)

    algo = CPO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        safety_gae_lambda=0.5,
        batch_size=100000,
        max_path_length=500,
        n_itr=2000,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )

    algo.train()

    f.close()
    ff.close()
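These run_task snippets omit their imports and their launcher. A minimal header that would make Example #1 self-contained is sketched below. The rllab module paths are standard, but the import paths for CPO and GatherSafetyConstraint depend on the local CPO/PDO sandbox and are assumptions, as are the launcher arguments.

import lasagne.nonlinearities as NL
from rllab.envs.mujoco.gather.ant_gather_env import AntGatherEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from rllab.misc.instrument import run_experiment_lite
from sandbox.cpo.algos.safe.cpo import CPO                                 # assumed path
from sandbox.cpo.safety_constraints.gather import GatherSafetyConstraint   # assumed path

# Typical rllab launch of the experiment defined by run_task above.
run_experiment_lite(
    run_task,
    n_parallel=4,          # illustrative value
    snapshot_mode="last",
    seed=1,                # illustrative value
    exp_prefix="cpo_ant_gather",
)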
Example #2
def run_task(*_):

    # CSV file for logging experiment output (path is machine-specific).
    f = open('/home/qingkai/ddpg_performance.csv', "w+")

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    # Deterministic actor network for DDPG.
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))

    # Ornstein-Uhlenbeck exploration noise.
    es = OUStrategy(env_spec=env.spec)

    # Separate critics for the reward and for the safety cost.
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)

    # Safety constraint with threshold 0.2 on the expected cost.
    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        #plot=True,
    )

    algo.train()
    f.close()
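PDO_DDPG maintains a dual variable (initialized here with dual_var=0) alongside the separate cost critic qf_cost. The exact update lives in the algorithm class, but the underlying primal-dual idea is dual ascent on the constraint violation while the actor is trained on a Lagrangian of reward and cost. A minimal sketch of that idea with hypothetical helper names (not the PDO_DDPG API); the default values below simply mirror the configuration above (dual_learning_rate=1e-2, scale_reward=1, scale_cost=5, max_value=0.2).

def dual_ascent_step(dual_var, avg_cost, cost_limit=0.2, dual_learning_rate=1e-2):
    # One projected-gradient-ascent step on the dual variable: increase it when
    # the estimated cost exceeds the limit, decrease it otherwise, and clip at
    # zero so the multiplier stays non-negative.
    return max(0.0, dual_var + dual_learning_rate * (avg_cost - cost_limit))

def lagrangian_reward(reward, cost, dual_var, scale_reward=1.0, scale_cost=5.0):
    # Signal the actor/critic would be trained on for a fixed dual variable.
    return scale_reward * reward - dual_var * scale_cost * cost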
Example #3
def run_task(*_):
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.1)

    algo = PDO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=50000,
        max_path_length=15,
        n_itr=100,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        #plot=True,
    )

    algo.train()
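Both the reward and safety advantages in these on-policy runs are shaped by the discount and gae_lambda settings. For reference, below is a standalone sketch of generalized advantage estimation (GAE) as it is commonly computed; it is not lifted from the PDO implementation, and the function name is hypothetical.

import numpy as np

def gae_advantages(rewards, values, discount=0.995, gae_lambda=0.95):
    # rewards: length-T array of per-step rewards (or safety costs)
    # values:  length-(T+1) array of baseline predictions, including a terminal value
    T = len(rewards)
    advantages = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + discount * values[t + 1] - values[t]
        running = delta + discount * gae_lambda * running
        advantages[t] = running
    return advantages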
Example #4
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    # Off-policy (DDPG-style) components passed to PDO_OFF in addition to the
    # on-policy policy and baseline above.
    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=(64, 32))

    ddpg_es = OUStrategy(env_spec=env.spec)  # Ornstein-Uhlenbeck exploration noise

    # Separate critics for the reward and for the safety cost.
    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec,
                                          hidden_sizes=(100, 100))

    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        #plot=True,
    )

    algo.train()
    f.close()
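The es and ddpg_es objects above are rllab's OUStrategy, i.e. temporally correlated Ornstein-Uhlenbeck noise added to the deterministic policy's actions during rollout collection. A self-contained version of that process is sketched below for reference; the class is hypothetical and the parameter values are illustrative rather than read from OUStrategy's defaults.

import numpy as np

class OUNoise(object):
    # Ornstein-Uhlenbeck process: x <- x + theta * (mu - x) + sigma * N(0, I)
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones_like(self.state) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state

# Usage sketch: perturb the deterministic action during exploration.
# noise = OUNoise(action_dim=env.action_space.flat_dim)
# action, _ = policy.get_action(obs)
# action = action + noise.sample()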