def mountaincar_task_fn():
    exp_config = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **exp_config['DQN'])
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         **exp_config['EpsilonGreedy']['LinearScheduler']),
                                                     **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                         config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
                         func_dict={
                             'test': {'func': agent.test,
                                      'args': list(),
                                      'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'])},
                             'train': {'func': agent.train,
                                       'args': list(),
                                       'kwargs': dict()},
                             'sample': {'func': agent.sample,
                                        'args': list(),
                                        'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                                       env=agent.env,
                                                       in_which_status='TRAIN',
                                                       store_flag=True)},
                         })
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
def create_dqn(self, env_id='Acrobot-v1', name='dqn'):
    mlp_q, local = self.create_mlp_q_func(env_id, name='{}_mlp_q'.format(name))
    env_spec = local['env_spec']
    env = local['env']
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              name=name,
              value_func=mlp_q)
    return dqn, locals()
def test_l1_l2_norm(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    name = 'dqn'
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp',
                              name=name + '_mlp',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.01,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              name=name,
              value_func=mlp_q)
    dqn2, _ = self.create_dqn(name='dqn_2')
    a = TransitionData(env_spec)
    st = env.reset()
    dqn.init()
    dqn2.init()
    # Collect 100 transitions with the L1/L2-regularized DQN.
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
        st = st_new
    dqn.append_to_memory(a)
    for i in range(20):
        print('dqn1 loss: ',
              dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True))
        print('dqn2 loss: ',
              dqn2.train(batch_data=a, train_iter=10, sess=None, update_target=True))
    var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
    print(var_list)
    var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
    print(var_list2)
    # With heavy L1/L2 regularization, the weights of dqn should end up
    # smaller in magnitude than those of the unregularized dqn2.
    for var, var2 in zip(var_list, var_list2):
        diff = np.abs(var2) - np.abs(var)
        self.assertTrue(np.greater(np.mean(diff), 0.0).all())
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    # Save the algorithm periodically and anneal the exploration probability with a
    # piecewise schedule, both driven by the total training sample count.
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=PiecewiseScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                                                         outside_value=0.0
                                                     ),
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True))
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name + 'experiment_debug'
    )
    # Linearly decay the learning rate from 0.01 to 0.0001 over the course of the experiment.
    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=LinearScheduler(
                                     t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
                                     schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                                         'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                                     final_p=0.0001,
                                     initial_p=0.01))
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
    env_spec = env.env_spec

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "2",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "3",
                                      "N_UNITS": 256,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=32,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)
    # Anneal the exploration probability from 1.0 down to 0.02 over the first
    # 10,000 training samples (10% of 100,000).
    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=LinearScheduler(
                                       t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                       schedule_timesteps=int(0.1 * 100000),
                                       initial_p=1.0,
                                       final_p=0.02),
                                   init_random_prob=0.1)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)
    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1,
                                                     env=agent.env,
                                                     store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)),
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
def test_integration_with_dqn(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name='mlp_q',
                              name_scope='mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              name='dqn_test',
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              value_func=mlp_q)
    dqn.init()
    st = env.reset()
    from baconian.common.sampler.sample_data import TransitionData
    a = TransitionData(env_spec)
    res = []
    # Collect 100 transitions, then run two training calls: one on the
    # explicit batch and one sampled from the replay memory.
    for i in range(100):
        ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    dqn.append_to_memory(a)
    res.append(
        dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
    res.append(
        dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
    self.assertTrue(dqn in dqn.recorder._obj_log)
    self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
    self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
    self.assertTrue(
        np.equal(np.array(res),
                 [x['log_val'] for x in dqn.recorder._obj_log[dqn]['average_loss']]).all())
    self.assertTrue(len(Logger()._registered_recorders) > 0)
    self.assertTrue(dqn.recorder in Logger()._registered_recorders)
    Logger().flush_recorder()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.01,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    # Epsilon-greedy exploration with a fixed random-action probability.
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True))
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
def test_integration_with_dqn(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name='mlp_q',
                              name_scope='mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUTPUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              name='dqn_test',
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec, algo=dqn, name='agent')
    agent.init()
    # dqn.init()
    st = env.reset()
    from baconian.common.sampler.sample_data import TransitionData
    a = TransitionData(env_spec)
    res = []
    # Two sampling rounds in TRAIN status, followed by two training calls.
    agent.sample(env=env,
                 sample_count=100,
                 in_which_status='TRAIN',
                 store_flag=True,
                 sample_type='transition')
    agent.sample(env=env,
                 sample_count=100,
                 in_which_status='TRAIN',
                 store_flag=True,
                 sample_type='transition')
    res.append(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
    res.append(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
    self.assertTrue(dqn in dqn.recorder._obj_log)
    self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
    self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
    self.assertTrue(
        np.equal(np.array(res),
                 [x['value'] for x in dqn.recorder._obj_log[dqn]['average_loss']]).all())
    self.assertTrue(len(Logger()._registered_recorders) > 0)
    self.assertTrue(dqn.recorder in Logger()._registered_recorders)
    res = dqn.recorder.get_log(attr_name='average_loss', filter_by_status=dict())
    self.assertEqual(len(res), 2)
    res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TRAIN'})
    self.assertEqual(len(res), 2)
    res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TEST'})
    self.assertEqual(len(res), 0)
    Logger().flush_recorder()