def main(_):
  """Trains a DQN on CartPole-v0, evaluates it, then renders demo episodes."""
  batch_size = 64  # Shared by the Q-function and the experience-replay runner.
  env = environment_impl.GymEnvironment(gym.make('CartPole-v0'))
  qfunc = qfunc_impl.DQN(
      model=qfunc_impl.CreateModel(
          state_shape=env.GetStateShape(),
          action_space_size=env.GetActionSpaceSize(),
          hidden_layer_sizes=(20, 20, 20)),
      training_batch_size=batch_size,
      discount_factor=0.99,
  )
  runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=100000,
      experience_sample_batch_size=batch_size)

  # Train 500 episodes with an epsilon-greedy exploration policy.
  logging.ENV.debug_verbosity = 3
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=0.1)
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=500)

  # Evaluate for 100 episodes with the purely greedy policy.
  logging.ENV.debug_verbosity = 4
  policy = policy_impl.GreedyPolicy()
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=100)

  # Demo with rendering turned on.
  env.TurnOnRendering(should_render=True, fps=24)
  # env.StartRecording(video_filename='demo.mp4')  # uncomment to record video.

  # First 5 runs with random actions, as a visual baseline:
  runner.Run(
      env=env,
      brain=qfunc_impl.RandomQFunction(env.GetActionSpaceSize()),
      policy=policy,
      num_of_episodes=5)
  # Then 10 runs with the trained Q-function:
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=10)
def _RunEnv(gym_env):
  """Runs a screen-learning DQN against the given Gym environment.

  Args:
    gym_env: a Gym environment instance, wrapped into a
      ScreenGymEnvironment for pixel-based learning.
  """
  env = screen_learning.ScreenGymEnvironment(gym_env)
  qfunc = qfunc_impl.DQN_TargetNetwork(
      model=screen_learning.CreateConvolutionModel(
          action_space_size=env.GetActionSpaceSize()))
  # epsilon=1.0 means actions are fully random; this only exercises the
  # pipeline, not learning quality.
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  # Keyword changed from qfunc= to brain= for consistency with the other
  # Run call sites (SimpleRunner is invoked with brain= elsewhere) —
  # NOTE(review): confirm against runner_impl.SimpleRunner.Run's signature.
  runner_impl.SimpleRunner().Run(
      env=env, brain=qfunc, policy=policy, num_of_episodes=10)
def main(_):
  """Renders 10 episodes of Seaquest-v0 driven by random action values."""
  env = environment_impl.GymEnvironment(gym.make('Seaquest-v0'))
  env.TurnOnRendering(should_render=True, fps=24)
  # RandomValueQFunction assigns random values, so the greedy choice over
  # them is itself random; epsilon=1.0 makes exploration total as well.
  qfunc = qfunc_impl.RandomValueQFunction(
      action_space_size=env.GetActionSpaceSize())
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  runner = runner_impl.NoOpRunner()
  runner.Run(env, qfunc, policy, num_of_episodes=10)
def _RunEnv(gym_env):
  """Runs 10 fully-random episodes against the given Gym environment.

  Args:
    gym_env: a Gym environment instance to wrap and run.
  """
  env = environment_impl.GymEnvironment(gym_env)
  qfunc = qfunc_impl.RandomValueQFunction(
      action_space_size=env.GetActionSpaceSize())
  env.Reset()
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  # Keyword changed from qfunc= to brain= for consistency with the other
  # SimpleRunner.Run call sites in this codebase —
  # NOTE(review): confirm against runner_impl.SimpleRunner.Run's signature.
  runner_impl.SimpleRunner().Run(
      env=env, brain=qfunc, policy=policy, num_of_episodes=10)
def _RunEnv(gym_env):
  """Smoke-runs one short episode with a memoization Q-function.

  Args:
    gym_env: a Gym environment instance to wrap and run.
  """
  env = environment_impl.GymEnvironment(gym_env)
  # Cap the episode at 10 steps so this stays a quick smoke test.
  env.SetGymEnvMaxEpisodeSteps(10)
  qfunc = qfunc_impl.MemoizationQFunction(
      action_space_size=env.GetActionSpaceSize())
  env.Reset()
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  runner_impl.SimpleRunner().Run(
      env=env, brain=qfunc, policy=policy, num_of_episodes=1)
def _RunEnv(gym_env):
  """Runs a tiny DQN for 10 short episodes on the given Gym environment.

  Args:
    gym_env: a Gym environment instance to wrap and run.
  """
  env = environment_impl.GymEnvironment(gym_env)
  # Cap the episode at 10 steps so this stays a quick smoke test.
  env.SetGymEnvMaxEpisodeSteps(10)
  qfunc = qfunc_impl.DQN(model=qfunc_impl.CreateModel(
      state_shape=env.GetStateShape(),
      action_space_size=env.GetActionSpaceSize(),
      hidden_layer_sizes=(4,),
  ))
  env.Reset()
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  # Keyword changed from qfunc= to brain= for consistency with the other
  # SimpleRunner.Run call sites in this codebase —
  # NOTE(review): confirm against runner_impl.SimpleRunner.Run's signature.
  runner_impl.SimpleRunner().Run(
      env=env, brain=qfunc, policy=policy, num_of_episodes=10)
def test_GreedyPolicyWithRandomness_choosesNonOptimalAction(self):
  """With epsilon=0.5, ~half the decisions should bypass the Q-function."""
  mock_qfunc = mock.MagicMock()
  mock_qfunc.GetValues.return_value = numpy.array([[0.3, 0.7]])
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=0.5)
  for _ in range(500):
    policy.Decide(
        env=environment_impl.SingleStateEnvironment(
            action_space_size=2, step_limit=10),
        brain=mock_qfunc,
        state=numpy.array([[0]]),
        episode_idx=0,
        num_of_episodes=500)
  # Roughly half of the 500 decisions should have consulted the Q-function;
  # the [200, 300] band keeps the flakiness probability negligible.
  num_calls = mock_qfunc.GetValues.call_count
  self.assertGreater(num_calls, 200)
  self.assertLess(num_calls, 300)
def test_GreedyPolicyWithRandomness_considerAllActions(self):
  """With epsilon=1.0 every decision is random, so both actions appear."""
  mock_qfunc = mock.MagicMock()
  mock_qfunc.GetValues.return_value = numpy.array([[0.3, 0.7]])
  env = environment_impl.SingleStateEnvironment(
      action_space_size=2, step_limit=10)
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  choices = []
  for _ in range(500):
    action = policy.Decide(
        env=env,
        brain=mock_qfunc,
        state=numpy.array([[0]]),
        episode_idx=0,
        num_of_episodes=500)
    choices.append(env.GetChoiceFromAction(action))
  # Roughly half of the 500 fully-random decisions should pick action 0;
  # the [200, 300] band keeps the flakiness probability negligible.
  num_of_0s = sum(1 for c in choices if c == 0)
  self.assertGreater(num_of_0s, 200)
  self.assertLess(num_of_0s, 300)
def test_GreedyPolicyWithRandomness(self):
  # Tests that it can run; quality is not important for this test.
  # Keyword changed from qfunc= to brain= for consistency with the other
  # Run call sites in this codebase —
  # NOTE(review): confirm against the runner's Run signature.
  self.runner.Run(
      env=self.env,
      brain=self.qfunc,
      policy=policy_impl.GreedyPolicyWithRandomness(epsilon=0.1),
      num_of_episodes=1)