# 2. Construct the network and specify the algorithm.
# Here we use a small CNN as the perception net for the Actor-Critic algorithm
cnn = nn.Sequential(
    nn.Conv2d(d, 32, kernel_size=8, stride=4),
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1),
    nn.ReLU(),
    Flatten(),  # flatten the CNN cube to a vector
    nn.Linear(7 * 7 * 64, 512),
    nn.ReLU())

alg = SimpleAC(
    model=SimpleModelAC(
        dims=(d, h, w), num_actions=num_actions, perception_net=cnn),
    gpu_id=1)

# 3. Specify the settings for learning: data sampling strategy
# (OnlineHelper here) and other settings used by
# ComputationTask.
ct_settings = {
    "RL": dict(
        algorithm=alg,
        hyperparas=dict(grad_clip=5.0),
        # sampling
        agent_helper=OnlineHelper,
        # each agent will call `learn()` every `sample_interval` steps
        sample_interval=5,
        num_agents=num_agents)
}
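# Note: `Flatten` above is a small helper module, not something provided by
# older versions of torch.nn. A minimal sketch of such a helper, assuming it
# only needs to reshape the conv feature map into a (batch, features) matrix:
import torch.nn as nn

class Flatten(nn.Module):
    """Reshape a (N, C, H, W) tensor into (N, C*H*W) so it can feed nn.Linear."""

    def forward(self, x):
        return x.view(x.size(0), -1)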
def test_ct_learning(self):
    """
    Test training
    """
    num_actions = 2
    dims = 100
    batch_size = 8
    sensor = np.ones([batch_size, dims]).astype("float32") / dims  # normalize
    next_sensor = np.zeros([batch_size, dims]).astype("float32")

    for on_policy in [True, False]:
        if on_policy:
            alg = SimpleAC(model=SimpleModelAC(
                dims=dims,
                num_actions=num_actions,
                mlp=nn.Sequential(
                    nn.Linear(dims, 64, bias=False),
                    nn.ReLU(),
                    nn.Linear(64, 32, bias=False),
                    nn.ReLU())))
            ct = ComputationTask(
                "test", algorithm=alg, hyperparas=dict(lr=1e-1))
        else:
            alg = SimpleQ(
                model=SimpleModelQ(
                    dims=dims,
                    num_actions=num_actions,
                    mlp=nn.Sequential(
                        nn.Linear(dims, 64, bias=False),
                        nn.ReLU(),
                        nn.Linear(64, 32, bias=False),
                        nn.ReLU(),
                        nn.Linear(32, num_actions, bias=False))),
                update_ref_interval=100)
            ct = ComputationTask(
                "test", algorithm=alg, hyperparas=dict(lr=1e-1))

        for i in range(1000):
            if on_policy:
                outputs, _ = ct.predict(inputs=dict(sensor=sensor))
                actions = outputs["action"]
            else:
                ## randomly assemble a batch
                actions = np.random.choice(
                    [0, 1], size=(batch_size, 1), p=[0.5, 0.5]).astype("int")
            rewards = (1.0 - actions).astype("float32")
            cost = ct.learn(
                inputs=dict(sensor=sensor),
                next_inputs=dict(sensor=next_sensor),
                next_alive=dict(alive=np.zeros(
                    (batch_size, 1)).astype("float32")),
                actions=dict(action=actions),
                rewards=dict(reward=rewards))

        ### the policy should bias towards the first action
        outputs, _ = ct.predict(inputs=dict(sensor=sensor))
        for a in outputs["action"]:
            self.assertEqual(a[0], 0)
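# Because `next_alive` is all zeros above, every transition is presumably
# treated as terminal, so the learning target reduces to the immediate reward.
# The reward rule below is the one used in the test (reward = 1 - action), which
# is why the learned policy should settle on action 0. A standalone sketch:
import numpy as np

actions = np.array([[0], [1], [0], [1]], dtype="int")
rewards = (1.0 - actions).astype("float32")
print(rewards.ravel())  # [1. 0. 1. 0.] -> action 0 is always the better choice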
num_games = 8000

# 1. Create environments
envs = []
for _ in range(num_agents):
    envs.append(GymEnv(game))
state_shape = envs[-1].observation_dims()[0]
num_actions = envs[-1].action_dims()[0]

# 2. Construct the network and specify the algorithm.
# Here we use a small MLP and apply the Actor-Critic algorithm
mlp = nn.Sequential(
    nn.Linear(state_shape[0], 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU())

alg = SimpleAC(model=SimpleModelAC(
    dims=state_shape, num_actions=num_actions, perception_net=mlp))

# 3. Specify the settings for learning: data sampling strategy
# (OnlineHelper here) and other settings used by
# ComputationTask.
ct_settings = {
    "RL": dict(
        algorithm=alg,
        hyperparas=dict(lr=5e-5),
        # sampling
        agent_helper=OnlineHelper,
        # each agent will call `learn()` every `sample_interval` steps
        sample_interval=4,
        num_agents=num_agents)
}
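# For reference, the state/action dimensions queried from GymEnv above should
# mirror the raw gym spaces. A quick, hypothetical sanity check for a
# discrete-action game (uses plain gym, not the GymEnv wrapper):
import gym

env = gym.make("CartPole-v0")
print(env.observation_space.shape)  # e.g. (4,) -> corresponds to state_shape
print(env.action_space.n)           # e.g. 2   -> corresponds to num_actions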
reward_shaping_f = lambda x: x / 100.0
agents = []
for _ in range(num_agents):
    agent = SimpleRNNRLAgent(num_games, reward_shaping_f=reward_shaping_f)
    agent.set_env(GymEnv, game_name=game)
    agents.append(agent)

# 2. Construct the network and specify the algorithm.
# Here we use a small MLP and apply the Actor-Critic algorithm
hidden_size = 128
mlp = nn.Sequential(
    nn.Linear(state_shape[0], hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU())

alg = SimpleAC(
    model=SimpleRNNModelAC(
        dims=state_shape, num_actions=num_actions, perception_net=mlp),
    optim=(optim.RMSprop, dict(lr=1e-4)),
    ntd=True)

# 3. Specify the settings for learning: the algorithm to use (SimpleAC
# in this case), data sampling strategy (OnlineHelper here) and other
# settings used by ComputationTask.
ct_settings = {
    "RL": dict(
        alg=alg,
        # sampling
        agent_helper=OnlineHelper,
        sample_interval=8,
        num_agents=num_agents)
}
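# The `optim` argument above is an (optimizer class, kwargs) pair. As a purely
# illustrative sketch (not necessarily how SimpleAC consumes it internally),
# such a pair can be materialized against a model's parameters like this:
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)  # stand-in model, for illustration only
opt_cls, opt_kwargs = optim.RMSprop, dict(lr=1e-4)
optimizer = opt_cls(model.parameters(), **opt_kwargs)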
    nn.Conv2d(32, 32, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    Flatten(),  # flatten the CNN cube to a vector
    nn.Linear(1920, 512),
    nn.ReLU())

alg = SimpleAC(
    model=SimpleModelAC(
        dims=(d, h, w), num_actions=num_actions, perception_net=cnn),
    optim=(optim.RMSprop, dict(lr=1e-4)),
    gpu_id=1)

# 3. Specify the settings for learning: data sampling strategy
# (OnlineHelper here) and other settings used by
# ComputationTask.
ct_settings = {
    "RL": dict(
        alg=alg,
        # sampling
        agent_helper=OnlineHelper,
        # each agent will call `learn()` every `sample_interval` steps
        sample_interval=2,
        num_agents=num_agents)
}
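# The value 1920 passed to nn.Linear must equal the flattened size of the last
# conv feature map, which depends on the input resolution (d, h, w). A hedged
# sketch for probing that size with a dummy tensor; the first conv block and
# the 84x84 input below are assumptions for illustration only:
import torch
import torch.nn as nn

conv_part = nn.Sequential(  # conv/pool layers only, i.e. everything before Flatten()
    nn.Conv2d(3, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2),
    nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2))

d, h, w = 3, 84, 84  # assumed input dims
flat_size = conv_part(torch.zeros(1, d, h, w)).view(1, -1).size(1)
print(flat_size)  # pass this value to the first nn.Linear after Flatten()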
reward_shaping_f = lambda x: x / 100.0
agents = []
for _ in range(num_agents):
    agent = SimpleRLAgent(num_games, reward_shaping_f=reward_shaping_f)
    agent.set_env(GymEnv, game_name=game)
    agents.append(agent)

# 2. Construct the network and specify the algorithm.
# Here we use a small MLP and apply the Actor-Critic algorithm
mlp = nn.Sequential(
    nn.Linear(state_shape[0], 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU())

alg = SimpleAC(
    model=SimpleModelAC(
        dims=state_shape, num_actions=num_actions, perception_net=mlp),
    optim=(optim.RMSprop, dict(lr=5e-5)),
    gpu_id=-1)  ## use cpu

# 3. Specify the settings for learning: data sampling strategy
# (OnlineHelper here) and other settings used by
# ComputationTask.
ct_settings = {
    "RL": dict(
        alg=alg,
        # sampling
        agent_helper=OnlineHelper,
        # each agent will call `learn()` every `sample_interval` steps
        sample_interval=4,
        num_agents=num_agents)
}
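# gpu_id=-1 selects the CPU above. One common convention for such a flag
# (assumed here for illustration; SimpleAC's internal handling may differ)
# maps it onto a torch device as follows:
import torch

gpu_id = -1
device = torch.device("cpu") if gpu_id < 0 else torch.device("cuda:%d" % gpu_id)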
def test_gym_games(self):
    """
    Test games in OpenAI gym.
    """
    games = ["MountainCar-v0", "CartPole-v0"]
    final_rewards_thresholds = [
        -1.8,  ## drive to the right top in 180 steps (timeout is -2.0)
        1.5  ## hold the pole for at least 150 steps
    ]

    for game, threshold in zip(games, final_rewards_thresholds):
        for on_policy in [False, True]:
            if on_policy and game != "CartPole-v0":
                ## SimpleAC has difficulty training mountain-car and acrobot
                continue

            env = gym.make(game)
            state_shape = env.observation_space.shape[0]
            num_actions = env.action_space.n

            mlp = nn.Sequential(
                nn.Linear(state_shape, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU())

            if on_policy:
                alg = SimpleAC(
                    model=SimpleModelAC(
                        dims=state_shape, num_actions=num_actions, mlp=mlp),
                    hyperparas=dict(lr=1e-3))
            else:
                alg = SimpleQ(
                    model=SimpleModelQ(
                        dims=state_shape,
                        num_actions=num_actions,
                        mlp=nn.Sequential(mlp, nn.Linear(128, num_actions))),
                    hyperparas=dict(lr=1e-4),
                    exploration_end_batches=25000,
                    update_ref_interval=100)

            print("algorithm: " + alg.__class__.__name__)

            ct = ComputationTask(algorithm=alg)
            batch_size = 16
            if not on_policy:
                train_every_steps = batch_size // 4
                buffer_size_limit = 100000
            max_episode = 5000

            average_episode_reward = []
            past_exps = []
            max_steps = env._max_episode_steps
            for n in range(max_episode):
                ob = env.reset()
                episode_reward = 0
                for t in range(max_steps):
                    res, _ = ct.predict(inputs=dict(
                        sensor=np.array([ob]).astype("float32")))
                    pred_action = res["action"][0][0]

                    next_ob, reward, next_is_over, _ = env.step(pred_action)
                    reward /= 100
                    episode_reward += reward

                    past_exps.append((ob, next_ob, [pred_action], [reward],
                                      [not next_is_over]))
                    ## only for off-policy training do we use a circular buffer
                    if (not on_policy) and len(past_exps) > buffer_size_limit:
                        past_exps.pop(0)

                    ## compute the learning condition
                    learn_cond = False
                    if on_policy:
                        learn_cond = (len(past_exps) >= batch_size)
                        exps = past_exps  ## directly use all exps in the buffer
                    else:
                        ## only learn when the buffer can fill a whole batch
                        learn_cond = (
                            t % train_every_steps == train_every_steps - 1
                            and len(past_exps) >= batch_size)
                        if learn_cond:
                            exps = sample(past_exps, batch_size)  ## sample some exps

                    if learn_cond:
                        sensor, next_sensor, action, reward, next_episode_end \
                            = unpack_exps(exps)
                        cost = ct.learn(
                            inputs=dict(sensor=sensor),
                            next_inputs=dict(next_sensor=next_sensor),
                            next_episode_end=dict(
                                next_episode_end=next_episode_end),
                            actions=dict(action=action),
                            rewards=dict(reward=reward))
                        ## we clear the exp buffer for on-policy
                        if on_policy:
                            past_exps = []

                    ob = next_ob

                    ## end before Gym wrongly gives game_over=True for a timeout case
                    if t == max_steps - 2 or next_is_over:
                        break

                if n % 50 == 0:
                    print("episode reward: %f" % episode_reward)

                average_episode_reward.append(episode_reward)
                if len(average_episode_reward) > 20:
                    average_episode_reward.pop(0)

            ### compare the average episode reward to reduce variance
            self.assertGreater(
                sum(average_episode_reward) / len(average_episode_reward),
                threshold)
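# `unpack_exps` is a helper not shown in this snippet. A hypothetical sketch,
# assuming each experience is the tuple
# (ob, next_ob, [action], [reward], [not next_is_over]) appended above:
import numpy as np

def unpack_exps(exps):
    """Hypothetical helper: split experience tuples into batched numpy arrays."""
    obs, next_obs, actions, rewards, flags = zip(*exps)
    return (np.array(obs).astype("float32"),
            np.array(next_obs).astype("float32"),
            np.array(actions).astype("int"),
            np.array(rewards).astype("float32"),
            np.array(flags).astype("float32"))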