def run_task(*_):
    # Log files for experiment results and the dual-variable trace (user-specific paths).
    f = open('/home/qingkai/verina.csv', "w+")
    ff = open('/home/qingkai/cpo_dual.csv', "w+")

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = AntGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        }
    )

    # Separate value function for the safety signal, fit to safety returns.
    safety_baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        },
        target_key='safety_returns',
    )

    safety_constraint = GatherSafetyConstraint(max_value=0.2, baseline=safety_baseline)

    algo = CPO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        safety_gae_lambda=0.5,
        batch_size=100000,
        max_path_length=500,
        n_itr=2000,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )

    algo.train()
    f.close()
    ff.close()
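# Launch sketch: run_task functions like the one above are typically handed to
# rllab's experiment launcher rather than called directly. A minimal example,
# assuming the standard run_experiment_lite API; exp_prefix and seed are
# illustrative placeholders, not values from the original script.
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampler workers
    snapshot_mode="last",  # keep only the most recent snapshot
    seed=1,
    exp_prefix="cpo_ant_gather",  # hypothetical experiment name
)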
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             safety_constraint=None,
             pdo_vf_mode=1,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        optimizer = ConjugateGradientOptimizer(**optimizer_args)

    # Drop any caller-supplied values for the flags this subclass fixes below.
    pop_keys = [
        'safety_constrained_optimizer',
        'safety_tradeoff',
        'learn_safety_tradeoff_coeff',
        'safety_key'
    ]
    for key in pop_keys:
        if key in kwargs:
            kwargs.pop(key)

    if pdo_vf_mode == 1:
        # One-VF mode: won't be using a safety baseline, so the key should
        # not be advantages.
        safety_key = 'returns'
    else:
        safety_key = 'advantages'

    if pdo_vf_mode == 2 and not hasattr(safety_constraint, 'baseline'):
        logger.log("Warning: selected two-VF PDO, without providing VF for safety constraint.")
        logger.log("Defaulting to one-VF PDO.")
        pdo_vf_mode = 1
        safety_key = 'returns'

    super(PDO_OFF, self).__init__(
        optimizer=optimizer,
        safety_constrained_optimizer=False,
        safety_constraint=safety_constraint,
        safety_tradeoff=True,
        learn_safety_tradeoff_coeff=True,
        safety_key=safety_key,
        pdo_vf_mode=pdo_vf_mode,
        **kwargs)
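# Two-VF PDO sketch: pdo_vf_mode=2 only takes effect when the safety
# constraint carries its own value-function baseline; otherwise the
# constructor above warns and falls back to one-VF mode. A hedged
# illustration reusing the classes from the launcher scripts in this repo
# (imports and the remaining PDO_OFF arguments follow those scripts;
# hyperparameters are placeholders):
env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)
safety_baseline = GaussianMLPBaseline(
    env_spec=env.spec,
    regressor_args={'hidden_sizes': (64, 32)},
    target_key='safety_returns',  # regress the safety VF on safety returns
)
safety_constraint = GatherSafetyConstraint(max_value=0.2, baseline=safety_baseline)
# With a baseline attached, pdo_vf_mode=2 keeps safety_key='advantages', so
# the learned safety VF is what enters the tradeoff term.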
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             safety_constrained_optimizer=True,
             safety_constraint=None,
             **kwargs):
    Serializable.quick_init(self, locals())
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        # Use the constrained CG optimizer only when a safety constraint is
        # actually being enforced; otherwise fall back to plain TRPO's optimizer.
        if safety_constraint is not None and safety_constrained_optimizer:
            optimizer = ConjugateConstraintOptimizer(**optimizer_args)
        else:
            optimizer = ConjugateGradientOptimizer(**optimizer_args)
    super(TRPO, self).__init__(
        optimizer=optimizer,
        safety_constrained_optimizer=safety_constrained_optimizer,
        safety_constraint=safety_constraint,
        **kwargs)
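# Usage sketch for the constructor above: with a safety constraint and the
# default safety_constrained_optimizer=True, this variant selects the
# ConjugateConstraintOptimizer. The hyperparameters mirror the Point-Gather
# launchers in this repo and are illustrative, not prescribed.
env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))
baseline = GaussianMLPBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    safety_constraint=GatherSafetyConstraint(max_value=0.1),
    batch_size=50000,
    max_path_length=15,
    n_itr=100,
    gae_lambda=0.95,
    discount=0.995,
    step_size=0.01,
)
algo.train()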
def run_task(*_):
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.1)

    algo = PDO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=50000,
        max_path_length=15,
        n_itr=100,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )

    algo.train()
def run_task(*_):
    # User-specific result log written during training.
    f = open('/home/qingkai/verina.csv', "w+")

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    # Off-policy (DDPG) components: deterministic policy, OU exploration
    # strategy, and separate Q-functions for reward and cost.
    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))
    ddpg_es = OUStrategy(env_spec=env.spec)
    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(100, 100))

    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        # plot=True,
    )

    algo.train()
    f.close()