def test_predict(self):
    """
    Test case for AC-learning and Q-learning predictions
    """
    num_actions = 4

    def test(input, ct, max):
        action_counter = [0] * num_actions
        total = 3000
        for i in range(total):
            actions, states = ct.predict(inputs=input)
            assert not states, "states should be empty"
            ## actions["action"] is a batch of actions
            for a in actions["action"]:
                action_counter[a[0]] += 1

        if max:
            ### if max, some action will always be chosen (which action is
            ### chosen depends on the network initialization)
            count = 0
            for i in range(num_actions):
                prob = action_counter[i] / float(sum(action_counter))
                if abs(prob - 1.0) < 1e-1:
                    count = count + 1
            self.assertEqual(count, 1)
        else:
            ### the actions should be uniform
            for i in range(num_actions):
                prob = action_counter[i] / float(sum(action_counter))
                self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

    dims = 100

    q_cnn = SimpleQ(model=TestModelCNN(
        width=84, height=84, num_actions=num_actions))
    q = SimpleQ(model=SimpleModelQ(
        dims=[dims],
        num_actions=num_actions,
        perception_net=nn.Sequential(
            nn.Linear(dims, 32, bias=False),
            nn.ReLU(),
            nn.Linear(32, 16, bias=False),
            nn.ReLU())))

    batch_size = 10
    height, width = 84, 84
    sensor = np.zeros([batch_size, dims]).astype("float32")
    image = np.zeros([batch_size, 1, height, width]).astype("float32")

    ct0 = ComputationTask("test", algorithm=q_cnn)
    ct1 = ComputationTask("test", algorithm=q)

    test(dict(image=image), ct0, max=False)
    test(dict(sensor=sensor), ct1, max=True)
def test_predict(self):
    """
    Test case for AC-learning and Q-learning predictions
    """
    num_actions = 4

    def test(input, ct, max):
        action_counter = [0] * num_actions
        total = 2000
        for i in range(total):
            actions, states = ct.predict(inputs=input)
            assert not states, "states should be empty"
            ## actions["action"] is a batch of actions
            for a in actions["action"]:
                action_counter[a[0]] += 1

        if max:
            ### if max, the first action will always be chosen
            for i in range(num_actions):
                prob = action_counter[i] / float(sum(action_counter))
                self.assertAlmostEqual(
                    prob, 1.0 if i == 0 else 0.0, places=1)
        else:
            ### the actions should be uniform
            for i in range(num_actions):
                prob = action_counter[i] / float(sum(action_counter))
                self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

    dims = 100

    q_cnn = SimpleQ(
        model=TestModelCNN(width=84, height=84, num_actions=num_actions))
    q = SimpleQ(model=SimpleModelQ(
        dims=dims,
        num_actions=num_actions,
        mlp=nn.Sequential(
            nn.Linear(dims, 32, bias=False),
            nn.ReLU(),
            nn.Linear(32, 16, bias=False),
            nn.ReLU(),
            nn.Linear(16, num_actions, bias=False))))

    batch_size = 10
    height, width = 84, 84
    sensor = np.zeros([batch_size, dims]).astype("float32")
    image = np.zeros([batch_size, 1, height, width]).astype("float32")

    ct0 = ComputationTask(algorithm=q_cnn)
    ct1 = ComputationTask(algorithm=q)

    test(dict(image=image), ct0, max=False)
    test(dict(sensor=sensor), ct1, max=True)
def test_ct_learning(self):
    """
    Test training
    """
    num_actions = 2
    dims = 100
    batch_size = 8
    sensor = np.ones([batch_size, dims]).astype("float32") / dims  # normalize
    next_sensor = np.zeros([batch_size, dims]).astype("float32")

    for on_policy in [True, False]:
        if on_policy:
            alg = SimpleAC(model=SimpleModelAC(
                dims=dims,
                num_actions=num_actions,
                mlp=nn.Sequential(
                    nn.Linear(dims, 64, bias=False),
                    nn.ReLU(),
                    nn.Linear(64, 32, bias=False),
                    nn.ReLU())))
            ct = ComputationTask(
                "test", algorithm=alg, hyperparas=dict(lr=1e-1))
        else:
            alg = SimpleQ(
                model=SimpleModelQ(
                    dims=dims,
                    num_actions=num_actions,
                    mlp=nn.Sequential(
                        nn.Linear(dims, 64, bias=False),
                        nn.ReLU(),
                        nn.Linear(64, 32, bias=False),
                        nn.ReLU(),
                        nn.Linear(32, num_actions, bias=False))),
                update_ref_interval=100)
            ct = ComputationTask(
                "test", algorithm=alg, hyperparas=dict(lr=1e-1))

        for i in range(1000):
            if on_policy:
                outputs, _ = ct.predict(inputs=dict(sensor=sensor))
                actions = outputs["action"]
            else:
                ## randomly assemble a batch
                actions = np.random.choice(
                    [0, 1], size=(batch_size, 1),
                    p=[0.5, 0.5]).astype("int")
            rewards = (1.0 - actions).astype("float32")
            cost = ct.learn(
                inputs=dict(sensor=sensor),
                next_inputs=dict(sensor=next_sensor),
                next_alive=dict(
                    alive=np.zeros((batch_size, 1)).astype("float32")),
                actions=dict(action=actions),
                rewards=dict(reward=rewards))

        ### the policy should bias towards the first action
        outputs, _ = ct.predict(inputs=dict(sensor=sensor))
        for a in outputs["action"]:
            self.assertEqual(a[0], 0)
for _ in range(num_agents):
    envs.append(GymEnv(game))
state_shape = envs[-1].observation_dims()[0]
num_actions = envs[-1].action_dims()[0]

# 2. Construct the network and specify the algorithm.
#    Here we use a small MLP and apply the Q-learning algorithm.
inner_size = 256
mlp = nn.Sequential(
    nn.Linear(state_shape[0], inner_size),
    nn.ReLU(),
    nn.Linear(inner_size, inner_size),
    nn.ReLU(),
    nn.Linear(inner_size, inner_size),
    nn.ReLU())
alg = SimpleQ(
    model=SimpleModelQ(
        dims=state_shape, num_actions=num_actions, perception_net=mlp),
    exploration_end_steps=500000 / num_agents,
    update_ref_interval=100)

# 3. Specify the settings for learning: the algorithm to use (SimpleQ
#    in this case), the data sampling strategy (ExpReplayHelper here) and
#    other settings used by ComputationTask.
ct_settings = {
    "RL": dict(
        num_agents=num_agents,
        algorithm=alg,
        hyperparas=dict(lr=1e-4),
        # sampling
        agent_helper=ExpReplayHelper,
        buffer_capacity=200000 / num_agents,
        num_experiences=4,  # num per agent
# 2. Construct the network and specify the algorithm.
#    Here we use a small CNN as the perception net for the Q-learning algorithm.
cnn = nn.Sequential(
    nn.Conv2d(d, 32, kernel_size=8, stride=4),
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1),
    nn.ReLU(),
    Flatten(),  # flatten the CNN cube to a vector
    nn.Linear(7 * 7 * 64, 512),
    nn.ReLU())

alg = SimpleQ(
    model=SimpleModelQ(
        dims=(d, h, w), num_actions=num_actions, perception_net=cnn),
    gpu_id=0,
    exploration_end_steps=500000 / num_agents,
    update_ref_interval=100)

# 3. Specify the settings for learning: the data sampling strategy
#    (ExpReplayHelper here) and other settings used by ComputationTask.
ct_settings = {
    "RL": dict(
        num_agents=num_agents,
        algorithm=alg,
        hyperparas=dict(lr=1e-4, grad_clip=5.0),
        # sampling
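# NOTE: `Flatten` is not defined in the snippet above; it is assumed to be a
# tiny custom module that reshapes the CNN feature cube into one vector per
# sample so it can feed the following nn.Linear layer. A minimal sketch
# (assuming PyTorch) could look like this:
import torch.nn as nn


class Flatten(nn.Module):
    """Flatten all dimensions except the batch dimension."""

    def forward(self, x):
        # (N, C, H, W) -> (N, C * H * W)
        return x.view(x.size(0), -1)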
def test_gym_games(self):
    """
    Test games in OpenAI gym.
    """
    games = ["MountainCar-v0", "CartPole-v0", "Pendulum-v0"]
    final_rewards_thresholds = [
        -1.5,  ## drive to the right top in 150 steps (timeout is -2.0)
        1.5,  ## hold the pole for at least 150 steps
        -3.0  ## can swing the stick to the top most of the time
    ]
    on_policies = [False, True, False]
    discrete_actions = [True, True, False]

    for game, threshold, on_policy, discrete_action in \
            zip(games, final_rewards_thresholds, on_policies, discrete_actions):

        env = gym.make(game)
        state_shape = env.observation_space.shape[0]
        if discrete_action:
            num_actions = env.action_space.n
        else:
            num_actions = env.action_space.shape[0]

        hidden_size = 256
        mlp = nn.Sequential(
            nn.Linear(state_shape, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU())
        q_model = SimpleModelQ(
            dims=state_shape,
            num_actions=num_actions,
            mlp=nn.Sequential(mlp, nn.Linear(hidden_size, num_actions)))

        if on_policy:
            alg = SimpleSARSA(model=q_model, epsilon=0.1)
            # alg = SuccessorRepresentationQ(
            #     ## much slower than SARSA because of more things to learn
            #     model=SimpleSRModel(
            #         dims=state_shape,
            #         hidden_size=hidden_size,
            #         num_actions=num_actions),
            #     exploration_end_steps=20000)
        else:
            if discrete_action:
                alg = SimpleQ(
                    model=q_model,
                    exploration_end_steps=200000,
                    update_ref_interval=100)
            else:
                alg = OffPolicyAC(
                    model=GaussianPolicyModel(
                        dims=state_shape,
                        action_dims=num_actions,
                        mlp=mlp,
                        std=1.0),
                    epsilon=0.2)

        glog.info("algorithm: " + alg.__class__.__name__)

        ct = ComputationTask("RL", algorithm=alg, hyperparas=dict(lr=1e-4))
        batch_size = 32
        if not on_policy:
            train_every_steps = batch_size / 4
            buffer_size_limit = 200000
        max_episode = 10000

        average_episode_reward = []
        past_exps = []
        max_steps = env._max_episode_steps
        for n in range(max_episode):
            ob = env.reset()
            episode_reward = 0
            alive = 1
            for t in range(max_steps):
                inputs = dict(sensor=np.array([ob]).astype("float32"))
                res, _ = ct.predict(inputs=inputs)

                ## when discrete_action is True, this is a scalar
                ## otherwise it's a floating vector
                pred_action = res["action"][0]

                ## end before the env wrongly gives game_over=True for a timeout case
                if t == max_steps - 1:
                    past_exps.append((inputs, res, dict(reward=[[0]]),
                                      dict(alive=[[-1]])))  ## -1 denotes timeout
                    break
                elif not alive:
                    past_exps.append((inputs, res, dict(reward=[[0]]),
                                      dict(alive=[[alive]])))
                    break
                else:
                    next_ob, reward, next_is_over, _ = env.step(
                        pred_action[0] if discrete_action else pred_action)
                    reward /= 100
                    episode_reward += reward
                    past_exps.append((inputs, res, dict(reward=[[reward]]),
                                      dict(alive=[[alive]])))

                ## only for off-policy training we use a circular buffer
                if (not on_policy) and len(past_exps) > buffer_size_limit:
                    past_exps.pop(0)

                ## compute the learning condition
                learn_cond = False
                if on_policy:
                    learn_cond = (len(past_exps) >= batch_size)
                else:
                    learn_cond = (
                        t % train_every_steps == train_every_steps - 1)

                if learn_cond:
                    exps = sample(past_exps, batch_size)
                    sampled_inputs, next_sampled_inputs, sampled_actions, \
                        next_sampled_actions, reward, next_alive = unpack_exps(exps)
                    cost = ct.learn(
                        inputs=sampled_inputs,
                        next_inputs=next_sampled_inputs,
                        next_alive=next_alive,
                        actions=sampled_actions,
                        next_actions=next_sampled_actions,
                        rewards=reward)
                    ## we clear the exp buffer for on-policy
                    if on_policy:
                        past_exps = []

                ob = next_ob
                ### bool must be converted to int for correct computation
                alive = 1 - int(next_is_over)

            if n % 50 == 0:
                glog.info("episode reward: %f" % episode_reward)
            average_episode_reward.append(episode_reward)
            if len(average_episode_reward) > 20:
                average_episode_reward.pop(0)

            ### once we hit the threshold, we don't bother running more episodes
            if sum(average_episode_reward) / len(
                    average_episode_reward) > threshold:
                glog.info("Test terminates early because the reward "
                          "threshold is satisfied!")
                break

        ### compare the average episode reward to reduce variance
        self.assertGreater(
            sum(average_episode_reward) / len(average_episode_reward),
            threshold)
def test_gym_games(self):
    """
    Test games in OpenAI gym.
    """
    games = ["MountainCar-v0", "CartPole-v0"]
    final_rewards_thresholds = [
        -1.8,  ## drive to the right top in 180 steps (timeout is -2.0)
        1.5  ## hold the pole for at least 150 steps
    ]

    for game, threshold in zip(games, final_rewards_thresholds):
        for on_policy in [False, True]:
            if on_policy and game != "CartPole-v0":
                ## SimpleAC has difficulty training mountain-car and acrobot
                continue

            env = gym.make(game)
            state_shape = env.observation_space.shape[0]
            num_actions = env.action_space.n

            mlp = nn.Sequential(
                nn.Linear(state_shape, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU())

            if on_policy:
                alg = SimpleAC(
                    model=SimpleModelAC(
                        dims=state_shape, num_actions=num_actions, mlp=mlp),
                    hyperparas=dict(lr=1e-3))
            else:
                alg = SimpleQ(
                    model=SimpleModelQ(
                        dims=state_shape,
                        num_actions=num_actions,
                        mlp=nn.Sequential(mlp, nn.Linear(128, num_actions))),
                    hyperparas=dict(lr=1e-4),
                    exploration_end_batches=25000,
                    update_ref_interval=100)

            print("algorithm: " + alg.__class__.__name__)

            ct = ComputationTask(algorithm=alg)
            batch_size = 16
            if not on_policy:
                train_every_steps = batch_size / 4
                buffer_size_limit = 100000
            max_episode = 5000

            average_episode_reward = []
            past_exps = []
            max_steps = env._max_episode_steps
            for n in range(max_episode):
                ob = env.reset()
                episode_reward = 0
                for t in range(max_steps):
                    res, _ = ct.predict(inputs=dict(
                        sensor=np.array([ob]).astype("float32")))
                    pred_action = res["action"][0][0]

                    next_ob, reward, next_is_over, _ = env.step(pred_action)
                    reward /= 100
                    episode_reward += reward

                    past_exps.append((ob, next_ob, [pred_action],
                                      [reward], [not next_is_over]))
                    ## only for off-policy training we use a circular buffer
                    if (not on_policy) and len(past_exps) > buffer_size_limit:
                        past_exps.pop(0)

                    ## compute the learning condition
                    learn_cond = False
                    if on_policy:
                        learn_cond = (len(past_exps) >= batch_size)
                        exps = past_exps  ## directly use all exps in the buffer
                    else:
                        learn_cond = (
                            t % train_every_steps == train_every_steps - 1)
                        exps = sample(past_exps, batch_size)  ## sample some exps

                    if learn_cond:
                        sensor, next_sensor, action, reward, next_episode_end \
                            = unpack_exps(exps)
                        cost = ct.learn(
                            inputs=dict(sensor=sensor),
                            next_inputs=dict(next_sensor=next_sensor),
                            next_episode_end=dict(
                                next_episode_end=next_episode_end),
                            actions=dict(action=action),
                            rewards=dict(reward=reward))
                        ## we clear the exp buffer for on-policy
                        if on_policy:
                            past_exps = []

                    ob = next_ob

                    ## end before Gym wrongly gives game_over=True for a timeout case
                    if t == max_steps - 2 or next_is_over:
                        break

                if n % 50 == 0:
                    print("episode reward: %f" % episode_reward)

                average_episode_reward.append(episode_reward)
                if len(average_episode_reward) > 20:
                    average_episode_reward.pop(0)

            ### compare the average episode reward to reduce variance
            self.assertGreater(
                sum(average_episode_reward) / len(average_episode_reward),
                threshold)
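# NOTE: `sample` and `unpack_exps` are helpers that are not shown in this test.
# Judging from how they are called above, `sample` draws a batch of experiences
# from the buffer (with replacement, since the buffer can still be smaller than
# batch_size when learning starts) and `unpack_exps` stacks the stored
# (ob, next_ob, [pred_action], [reward], [not next_is_over]) tuples into batched
# numpy arrays. The following is only a hypothetical sketch consistent with that
# usage, not the framework's actual implementation:
import random

import numpy as np


def sample(exps, num):
    ## draw `num` experiences uniformly, with replacement
    return [random.choice(exps) for _ in range(num)]


def unpack_exps(exps):
    ## stack each field of the experience tuples into a batch
    obs, next_obs, actions, rewards, flags = zip(*exps)
    return (np.array(obs).astype("float32"),
            np.array(next_obs).astype("float32"),
            np.array(actions).astype("int"),
            np.array(rewards).astype("float32"),
            np.array(flags).astype("float32"))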