def __init__(self, gym_env_id: str, name: str = None):
    """
    :param gym_env_id: gym environment id
    :type gym_env_id: str
    :param name: name of the gym environment instance
    :type name: str
    """
    super().__init__(name=name if name else gym_env_id)
    self.env_id = gym_env_id
    try:
        self._gym_env = gym.make(gym_env_id)
    except gym_error.UnregisteredEnv:
        raise ValueError('Env id: {} is not supported currently'.format(gym_env_id))
    self.action_space = space_converter(self._gym_env.action_space)
    self.observation_space = space_converter(self._gym_env.observation_space)
    # Replace NaN bounds with finite numbers and patch sample() so that
    # unbounded Box spaces can still be sampled from.
    if isinstance(self.action_space, garage_space.Box):
        self.action_space.low = np.nan_to_num(self.action_space.low)
        self.action_space.high = np.nan_to_num(self.action_space.high)
        self.action_space.sample = types.MethodType(self._sample_with_nan, self.action_space)
    if isinstance(self.observation_space, garage_space.Box):
        self.observation_space.low = np.nan_to_num(self.observation_space.low)
        self.observation_space.high = np.nan_to_num(self.observation_space.high)
        self.observation_space.sample = types.MethodType(self._sample_with_nan, self.observation_space)
    self.env_spec = EnvSpec(obs_space=self.observation_space, action_space=self.action_space)
    self.reward_range = self._gym_env.reward_range
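# Usage sketch for this wrapper, assuming it is constructed through the
# module's make() helper as done in the tests and examples below:
#
#     from baconian.envs.gym_env import make
#
#     env = make('Pendulum-v0')           # wraps gym.make() and converts the spaces
#     st = env.reset()
#     ac = env.action_space.sample()      # patched sample() tolerates NaN bounds
#     new_st, re, done, info = env.step(ac)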
def test_correctness(self):
    env_id = 'Pendulum-v0'
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    dyna = DebugDynamics(env_spec=env_spec)
    dyna = DynamicsEnvWrapper(dynamics=dyna)
    dyna.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=DebuggingCostFunc())
    policy = iLQRPolicy(env_spec=env_spec,
                        T=10,
                        delta=0.05,
                        iteration=2,
                        dynamics=dyna,
                        dynamics_model_train_iter=10,
                        cost_fn=DebuggingCostFunc())
    st = env.reset()
    dyna.st = np.zeros_like(st)
    for i in range(10):
        ac = policy.forward(st)
        # Step the real env with the planned action.
        st, _, _, _ = env.step(ac)
        # st = dyna.step(action=ac, state=st)
        print("analytical optimal action -0.5, cost -0.25")
        print('state: {}, action: {}, cost {}'.format(
            st, ac,
            policy.iLqr_instance.cost_fn(state=st, action=ac, new_state=None)))
def test_trajectory_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        # Force a fixed trajectory length of 10, ignoring the env's own done flag.
        done = (i + 1) % 10 == 0
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
        if done:
            a.append(tmp_traj.get_copy())
            tmp_traj.reset()
    self.assertEqual(len(a.trajectories), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
def create_mlp_v(self, env_id='Pendulum-v0', name='mlp_v'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[
                              {
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "L1_NORM": 0.01,
                                  "L2_NORM": 0.01,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              },
                              {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUTPUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }
                          ])
    return mlp_v, locals()
def test_prior_eval(self):
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    data = TransitionData(env_spec=env_spec)
    policy = UniformRandomPolicy(env_spec=env_spec)
    # Do some initial sampling here to train the GMM model
    st = env.reset()
    for i in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gmm = GaussianMixtureDynamicsPrior(env_spec=env_spec, batch_data=data)
    gmm.init()
    gmm.update(batch_data=data)
    mu0, Phi, m, n0 = gmm.eval(batch_data=data)
    state_shape = data.state_set.shape[1]
    action_shape = data.action_set.shape[1]
    # The prior is over the concatenated (state, action, next_state) vector.
    self.assertEqual(state_shape + action_shape + state_shape, mu0.shape[0])
    self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[0])
    self.assertEqual(state_shape + action_shape + state_shape, Phi.shape[1])
def test_mlp_deterministic_policy(self):
    env = make('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, locals = self.create_mlp_deterministic_policy(name='mlp_policy',
                                                          env_spec=env_spec)
    policy.init()
    for _ in range(10):
        ac = policy.forward(obs=env.observation_space.sample())
        self.assertTrue(env.action_space.contains(ac[0]))
    # A copy without reuse gets its own variables: same shapes, different objects.
    p2 = policy.make_copy(name='test', name_scope='test', reuse=False)
    p2.init()
    self.assertGreater(len(policy.parameters('tf_var_list')), 0)
    self.assertGreater(len(p2.parameters('tf_var_list')), 0)
    for var1, var2 in zip(policy.parameters('tf_var_list'), p2.parameters('tf_var_list')):
        self.assertEqual(var1.shape, var2.shape)
        self.assertNotEqual(id(var1), id(var2))
    # A copy with reuse=True in the same name_scope shares the original variables.
    p3 = policy.make_copy(name='mlp_policy_2', name_scope='mlp_policy', reuse=True)
    p3.init()
    self.assertGreater(len(p3.parameters('tf_var_list')), 0)
    for var1, var2 in zip(policy.parameters('tf_var_list'), p3.parameters('tf_var_list')):
        self.assertEqual(var1.shape, var2.shape)
        self.assertEqual(id(var1), id(var2))
def create_continue_dynamics_model(self, env_id='Acrobot-v1', name='mlp_dyna'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + 'mlp_dyna',
        name=name + 'mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {
                "ACT": "RELU",
                "B_INIT_VALUE": None,
                "NAME": "1",
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            },
            {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUTPUT",
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }
        ])
    return mlp_dyna, locals()
def create_mlp_q_func(self, env_id='Acrobot-v1', name='mlp_q'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name,
                              name=name,
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 0.2,
                                      "L2_NORM": 0.1
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    return mlp_q, locals()
def test_correctness(self):
    env_id = 'Pendulum-v0'
    env = make(env_id)
    n = env.observation_space.flat_dim
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # Near-zero linear dynamics over the concatenated [state; action] vector.
    F = np.ones([env.observation_space.flat_dim,
                 env.observation_space.flat_dim + env.action_space.flat_dim]) * 0.00001
    # F[n:, n:] = 0.0001
    dyna = LinearDynamicsModel(env_spec=env_spec,
                               state_transition_matrix=F,
                               bias=np.zeros([env.observation_space.flat_dim]))
    # Quadratic cost parameters over [state; action]; the large negative linear
    # term on the action entries (c[n:]) drives the controller toward large actions.
    C = np.ones([env.observation_space.flat_dim + env.action_space.flat_dim,
                 env.observation_space.flat_dim + env.action_space.flat_dim]) * 0.00001
    c = np.ones([env.observation_space.flat_dim + env.action_space.flat_dim])
    c[n:] = -1000
    # C[:n, :] = 0.
    # C[:, :n] = 0.
    # c[:n] = 0.0
    cost_fn = QuadraticCostFunc(C=C, c=c)
    policy = LQRPolicy(env_spec=env_spec,
                       T=5,
                       dynamics=dyna,
                       cost_fn=cost_fn)
    st = env.reset() * 0.0
    for i in range(10):
        ac = policy.forward(st)
        st = dyna.step(action=ac, state=st, allow_clip=True)
        print(cost_fn(state=st, action=ac, new_state=None))
        print(st, ac)
def test_init(self):
    env = make('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                 dtype=tf.float32,
                                 name='state_ph')
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope='mlp_q',
                          name='mlp_q',
                          state_input=state_input,
                          output_low=None,
                          output_high=None,
                          mlp_config=[
                              {
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              },
                              {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUTPUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }
                          ])
    mlp_v.init()
    mlp_v.forward(obs=env.observation_space.sample())
def test_trajectory_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        # Force a fixed trajectory length of 10, ignoring the env's own done flag.
        done = (i + 1) % 10 == 0
        tmp_traj.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
        if done:
            # Append a copy so the subsequent reset() cannot clear the stored trajectory.
            a.append(tmp_traj.get_copy())
            tmp_traj.reset()
    self.assertEqual(len(a.trajectories), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
    data = a.return_as_transition_data()
    data_gen = data.return_generator()
    for d, re, st in zip(data_gen, re_list, st_list):
        self.assertEqual(d[3], re)
        self.assertTrue(np.equal(st, d[1]).all())
def __init__(self, dmcs_env_id: str, name: str = None):
    """
    :param dmcs_env_id: dm_control suite domain name
    :param name: task name, also used as the name of this env instance
    """
    super().__init__(name=name if name else dmcs_env_id)
    self.env_id = dmcs_env_id
    self.timestep = {}
    try:
        self.env = suite.load(dmcs_env_id, name)
    except ValueError:
        raise ValueError('Env id: {} and task: {} is not supported currently'.format(dmcs_env_id, name))
    self.metadata = {'render.modes': ['human', 'rgb_array'],
                     'video.frames_per_second': int(np.round(1.0 / self.env.control_timestep()))}
    self.action_space = convert_dm_control_to_gym_space(self.env.action_spec())
    self.observation_space = convert_dm_control_to_gym_space(self.env.observation_spec())
    # Same NaN-bound handling as in the gym wrapper above.
    if isinstance(self.action_space, garage_space.Box):
        self.action_space.low = np.nan_to_num(self.action_space.low)
        self.action_space.high = np.nan_to_num(self.action_space.high)
        self.action_space.sample = types.MethodType(self._sample_with_nan, self.action_space)
    if isinstance(self.observation_space, garage_space.Box):
        self.observation_space.low = np.nan_to_num(self.observation_space.low)
        self.observation_space.high = np.nan_to_num(self.observation_space.high)
        self.observation_space.sample = types.MethodType(self._sample_with_nan, self.observation_space)
    self.env_spec = EnvSpec(obs_space=self.observation_space, action_space=self.action_space)
    self.viewer = None
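# Usage sketch. The wrapper class name below is hypothetical; suite.load pairs
# a dm_control domain name with a task name, e.g.:
#
#     env = DMControlEnv('cartpole', name='swingup')   # class name assumed
#     ac = env.action_space.sample()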
def test_copy(self):
    env = make('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                 dtype=tf.float32,
                                 name='state_ph')
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope='mlp_v',
                          name='mlp_v',
                          state_input=state_input,
                          output_low=None,
                          output_high=None,
                          mlp_config=[
                              {
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              },
                              {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUTPUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }
                          ])
    mlp_v.init()
    # reuse=True within the same name_scope shares the underlying variables.
    new_mlp = mlp_v.make_copy(name='new_mlp', name_scope='mlp_v', reuse=True)
    new_mlp.init()
    self.assertGreater(len(mlp_v.parameters('tf_var_list')), 0)
    self.assertGreater(len(new_mlp.parameters('tf_var_list')), 0)
    for var1, var2 in zip(mlp_v.parameters('tf_var_list'), new_mlp.parameters('tf_var_list')):
        self.assertEqual(var1.shape, var2.shape)
        self.assertEqual(id(var1), id(var2))
    # reuse=False in a fresh name_scope creates independent variables.
    not_reuse_mlp = mlp_v.make_copy(name='no-reuse-mlp', name_scope='mlp_no_reuse', reuse=False)
    not_reuse_mlp.init()
    self.assertGreater(len(not_reuse_mlp.parameters('tf_var_list')), 0)
    for var1, var2 in zip(mlp_v.parameters('tf_var_list'), not_reuse_mlp.parameters('tf_var_list')):
        self.assertEqual(var1.shape, var2.shape)
        self.assertNotEqual(id(var1), id(var2))
def create_continue_dynamics_model(self, env_id='Acrobot-v1', name='mlp_dyna'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_dyna, _ = self.create_continuous_mlp_global_dynamics_model(env_spec=env_spec,
                                                                   name=name)
    return mlp_dyna, locals()
def mountaincar_task_fn():
    exp_config = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **exp_config['DQN'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(
                      action_space=env_spec.action_space,
                      prob_scheduler=LinearScheduler(
                          t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                          **exp_config['EpsilonGreedy']['LinearScheduler']),
                      **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test,
                     'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'])},
            'train': {'func': agent.train,
                      'args': list(),
                      'kwargs': dict()},
            'sample': {'func': agent.sample,
                       'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env,
                                      in_which_status='TRAIN',
                                      store_flag=True)},
        })
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
def test_dynamics_model_in_pendulum(self):
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    for i in range(100):
        ac = policy.forward(st)
        new_st, re, _, _ = env.step(ac)
        data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
        st = new_st
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    for i in range(len(data.state_set)):
        res = gp.step(action=data.action_set[i], state=data.state_set[i], allow_clip=True)
        _, var = gp._state_transit(action=data.action_set[i],
                                   state=data.state_set[i],
                                   required_var=True)
        print(res)
        print(data.new_state_set[i])
        print(np.sqrt(var))
        # self.assertTrue(np.isclose(res, data.new_state_set[i], atol=1e-3).all())
        # The prediction should fall inside the 95% confidence interval
        # (+/- 1.96 sigma) around the observed next state.
        self.assertTrue(np.greater(data.new_state_set[i] + 1.96 * np.sqrt(var), res).all())
        self.assertTrue(np.less(data.new_state_set[i] - 1.96 * np.sqrt(var), res).all())
    lengthscales = {}
    variances = {}
    noises = {}
    for i, model in enumerate(gp.mgpr_model.models):
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
def test_dynamics_model_basic(self):
    env = self.create_env('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_uniform_policy(env_spec=env_spec)
    data = TransitionData(env_spec=env_spec)
    st = env.reset()
    ac = policy.forward(st)
    # Two deterministic transitions, repeated, so the GP can fit them exactly.
    for i in range(10):
        re = 0.0
        data.append(state=np.ones_like(st) * 0.5,
                    new_state=np.ones_like(st),
                    reward=re,
                    done=False,
                    action=np.ones_like(ac) * 0.1)
        data.append(state=np.ones_like(st),
                    new_state=np.ones_like(st) * 0.5,
                    reward=re,
                    done=False,
                    action=np.ones_like(ac) * -0.1)
    gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
    gp.init()
    gp.train()
    lengthscales = {}
    variances = {}
    noises = {}
    for i, model in enumerate(gp.mgpr_model.models):
        lengthscales['GP' + str(i)] = model.kern.lengthscales.value
        variances['GP' + str(i)] = np.array([model.kern.variance.value])
        noises['GP' + str(i)] = np.array([model.likelihood.variance.value])
    print('-----Learned models------')
    pd.set_option('precision', 3)
    print('---Lengthscales---')
    print(pd.DataFrame(data=lengthscales))
    print('---Variances---')
    print(pd.DataFrame(data=variances))
    print('---Noises---')
    print(pd.DataFrame(data=noises))
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * -0.1,
                                           state=np.ones_like(st)),
                                   np.ones_like(st) * 0.5).all())
    for i in range(5):
        self.assertTrue(np.isclose(gp.step(action=np.ones_like(ac) * 0.1,
                                           state=np.ones_like(st) * 0.5),
                                   np.ones_like(st)).all())
    for i in range(5):
        print(gp.step(action=np.ones_like(ac) * -0.1, state=np.ones_like(st) * 0.5))
def init(self):
    if self._inited_flag:
        print('Warning: this env has already been initialized. Check whether env.init() was called multiple times')
        print('Warning: duplicated env initialization has been ignored')
        return
    self._status.set_status('INITED')
    self.action_space = space_converter(self.action_space)
    self.observation_space = space_converter(self.observation_space)
    if isinstance(self.action_space, garage_space.Box):
        self.action_space.low = np.nan_to_num(self.action_space.low)
        self.action_space.high = np.nan_to_num(self.action_space.high)
        self.action_space.sample = types.MethodType(self._sample_with_nan, self.action_space)
    if isinstance(self.observation_space, garage_space.Box):
        self.observation_space.low = np.nan_to_num(self.observation_space.low)
        self.observation_space.high = np.nan_to_num(self.observation_space.high)
        self.observation_space.sample = types.MethodType(self._sample_with_nan, self.observation_space)
    self.env_spec = EnvSpec(obs_space=self.observation_space, action_space=self.action_space)
    self._inited_flag = True
def create_ilqr_policy(self, env_id='Pendulum-v0'):
    class DebuggingCostFunc(CostFunc):
        def __call__(self, state=None, action=None, new_state=None, **kwargs) -> float:
            # Pure action-magnitude penalty, for debugging the iLQR updates.
            return float(np.sum(action * action))

    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    dyna = UniformRandomDynamicsModel(env_spec=env_spec)
    dyna.init()
    policy = iLQRPolicy(env_spec=env_spec,
                        T=50,
                        delta=0.0005,
                        iteration=5,
                        dynamics=dyna,
                        cost_fn=DebuggingCostFunc())
    return policy, locals()
def test_transition_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = UniformRandomReplayBuffer(limit=10000,
                                  action_shape=env_spec.action_shape,
                                  observation_shape=env_spec.obs_shape)
    st = env.reset()
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        a.append(obs0=st, obs1=st_new, action=ac, reward=re, terminal1=done)
        st = st_new
    batch = a.sample(batch_size=10)
    self.assertTrue(batch.state_set.shape[0] == 10)
    self.assertTrue(batch.action_set.shape[0] == 10)
    self.assertTrue(batch.reward_set.shape[0] == 10)
    self.assertTrue(batch.done_set.shape[0] == 10)
    self.assertTrue(batch.new_state_set.shape[0] == 10)
def create_env_spec(self, env):
    return EnvSpec(action_space=env.action_space,
                   obs_space=env.observation_space)
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         **exp_config['POLICY'],
                                         output_low=env_spec.action_space.low,
                                         output_high=env_spec.action_space.high,
                                         reuse=False)
    ppo = PPO(env_spec=env_spec,
              **exp_config['PPO'],
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test,
                     'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                                    sample_trajectory_flag=True)},
            'train': {'func': agent.train,
                      'args': list(),
                      'kwargs': dict()},
            'sample': {'func': agent.sample,
                       'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env,
                                      sample_type='trajectory',
                                      in_which_status='TRAIN',
                                      store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def pendulum_task_fn():
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                output_low=env_spec.obs_space.low,
                                                output_high=env_spec.obs_space.high,
                                                **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {'func': agent.train,
                           'args': list(),
                           'kwargs': dict(state='state_agent_training')},
            'train_algo_from_synthesized_data': {'func': agent.train,
                                                 'args': list(),
                                                 'kwargs': dict(state='state_agent_training',
                                                                train_iter=1)},
            'train_dynamics': {'func': agent.train,
                               'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'test_algo': {'func': agent.test,
                          'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics,
                              'args': list(),
                              'kwargs': dict(sample_count=10, env=env)},
            'sample_from_real_env': {'func': agent.sample,
                                     'args': list(),
                                     'kwargs': dict(sample_count=10,
                                                    env=agent.env,
                                                    in_which_status='TRAIN',
                                                    store_flag=True)},
            'sample_from_dynamics_env': {'func': agent.sample,
                                         'args': list(),
                                         'kwargs': dict(sample_count=50,
                                                        sample_type='transition',
                                                        env=agent.algo.dynamics_env,
                                                        in_which_status='TRAIN',
                                                        store_flag=False)}
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'):
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + 'mlp_q',
                              name=name + 'mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    # Two dense layers -> four variables (weights and biases).
    self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4)
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + 'mlp_policy',
                                    name=name + 'mlp_policy',
                                    mlp_config=[
                                        {
                                            "ACT": "RELU",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "1",
                                            "N_UNITS": 16,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        },
                                        {
                                            "ACT": "LINEAR",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "OUTPUT",
                                            "N_UNITS": env_spec.flat_action_dim,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }
                                    ],
                                    reuse=False)
    self.assertTrue(len(policy.parameters('tf_var_list')) == 4)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name,
                replay_buffer=None)
    return ddpg, locals()
"""
A simple example of using a Gaussian Process (GP) to approximate the Gym
environment Pendulum-v0. We use the gpflow package to build the Gaussian
Process.
"""
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
import numpy as np
from baconian.common.sampler.sample_data import TransitionData
from baconian.algo.policy import UniformRandomPolicy
from baconian.algo.dynamics.gaussian_process_dynamiocs_model import GaussianProcessDyanmicsModel
from baconian.algo.dynamics.dynamics_model import DynamicsEnvWrapper
from baconian.algo.dynamics.terminal_func.terminal_func import RandomTerminalFunc
from baconian.algo.dynamics.reward_func.reward_func import RandomRewardFunc

env = make('Pendulum-v0')
name = 'demo_exp'
env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)
# Do some initial sampling here to train the GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
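# A sketch of how this demo could continue: attach the terminal/reward
# functions imported above (mirroring the DynamicsEnvWrapper usage in the
# iLQR test earlier in this section) and roll the learned GP forward as if
# it were an environment. The gym-style (state, reward, done, info) return
# of step() is assumed here.
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

st = env.reset()
for _ in range(5):
    ac = policy.forward(st)
    st, re, done, _ = dyna_env.step(action=ac)
    print('predicted state: {}, reward: {}, done: {}'.format(st, re, done))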
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[
                                        {
                                            "ACT": "RELU",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "1",
                                            "N_UNITS": 16,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        },
                                        {
                                            "ACT": "LINEAR",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "OUTPUT",
                                            "N_UNITS": env_spec.flat_action_dim,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }
                                    ],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(
                      action_space=env_spec.action_space,
                      prob_scheduler=PiecewiseScheduler(
                          t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                          endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                          outside_value=0.0),
                      init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + 'experiment_debug')
    # Anneal the DQN learning rate linearly over the course of training.
    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=LinearScheduler(
                                     t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
                                     schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                                         'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                                     final_p=0.0001,
                                     initial_p=0.01))
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {
                "ACT": "RELU",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            },
            {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUTPUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }
        ])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'))
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {'func': agent.test,
                     'args': list(),
                     'kwargs': dict(sample_count=10)},
            'train': {'func': agent.train,
                      'args': list(),
                      'kwargs': dict()},
            'sample': {'func': agent.sample,
                       'args': list(),
                       'kwargs': dict(sample_count=100,
                                      env=agent.env,
                                      in_which_status='TRAIN',
                                      store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
def test_l1_l2_norm(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    name = 'dqn'
    # Heavy L1/L2 regularization, so this network's weights should shrink
    # visibly compared to the unregularized baseline dqn2 below.
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp',
                              name=name + '_mlp',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name,
              value_func=mlp_q)
    dqn2, _ = self.create_dqn(name='dqn_2')
    a = TransitionData(env_spec)
    st = env.reset()
    dqn.init()
    dqn2.init()
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    for i in range(20):
        print('dqn1 loss: ',
              dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print('dqn2 loss: ',
              dqn2.train(batch_data=a, train_iter=10, sess=None, update_target=True))
    var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    print(var_list)
    var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
    print(var_list2)
    for var, var2 in zip(var_list, var_list2):
        # The unregularized net (dqn2) should end up with larger weights on average.
        diff = np.abs(var2) - np.abs(var)
        self.assertTrue(np.greater(np.mean(diff), 0.0).all())
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[
                              {
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "L1_NORM": 0.01,
                                  "L2_NORM": 0.01,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              },
                              {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUTPUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }
                          ])
    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=[
                                             {
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "L1_NORM": 0.01,
                                                 "L2_NORM": 0.01,
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             },
                                             {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUTPUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }
                                         ],
                                         reuse=False)
    ppo = PPO(env_spec=env_spec,
              config_or_config_dict={
                  "gamma": 0.995,
                  "lam": 0.98,
                  "policy_train_iter": 10,
                  "value_func_train_iter": 10,
                  "clipping_range": None,
                  "beta": 1.0,
                  "eta": 50,
                  "log_var_init": -1.0,
                  "kl_target": 0.003,
                  "policy_lr": 0.01,
                  "value_func_lr": 0.01,
                  "value_func_train_batch_size": 10,
                  "lr_multiplier": 1.0
              },
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + 'agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()