Example #1
    def test_predict(self):
        """
        Test case for AC-learning and Q-learning predictions
        """
        num_actions = 4

        def test(input, ct, max):
            action_counter = [0] * num_actions
            total = 3000
            for i in range(total):
                actions, states = ct.predict(inputs=input)
                assert not states, "states should be empty"
                ## actions["action"] is a batch of actions
                for a in actions["action"]:
                    action_counter[a[0]] += 1

            if max:
                ### if max, some action will always be chosen (which action is
                ### chosen depends on the network initialization)
                count = 0
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    if abs(prob - 1.0) < 1e-1:
                        count = count + 1
                self.assertEqual(count, 1)
            else:
                ### the actions should be uniform
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

        dims = 100

        q_cnn = SimpleQ(model=TestModelCNN(
            width=84, height=84, num_actions=num_actions))

        q = SimpleQ(model=SimpleModelQ(
            dims=[dims],
            num_actions=num_actions,
            perception_net=nn.Sequential(
                nn.Linear(dims, 32, bias=False),
                nn.ReLU(),
                nn.Linear(32, 16, bias=False),
                nn.ReLU())))

        batch_size = 10
        height, width = 84, 84
        sensor = np.zeros([batch_size, dims]).astype("float32")
        image = np.zeros([batch_size, 1, height, width]).astype("float32")

        ct0 = ComputationTask("test", algorithm=q_cnn)
        ct1 = ComputationTask("test", algorithm=q)

        test(dict(image=image), ct0, max=False)
        test(dict(sensor=sensor), ct1, max=True)
Example #2
    def test_predict(self):
        """
        Test case for AC-learning and Q-learning predictions
        """
        num_actions = 4

        def test(input, ct, max):
            action_counter = [0] * num_actions
            total = 2000
            for i in range(total):
                actions, states = ct.predict(inputs=input)
                assert not states, "states should be empty"
                ## actions["action"] is a batch of actions
                for a in actions["action"]:
                    action_counter[a[0]] += 1

            if max:
                ### if max, the first action will always be chosen
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob,
                                           1.0 if i == 0 else 0.0,
                                           places=1)
            else:
                ### the actions should be uniform
                for i in range(num_actions):
                    prob = action_counter[i] / float(sum(action_counter))
                    self.assertAlmostEqual(prob, 1.0 / num_actions, places=1)

        dims = 100

        q_cnn = SimpleQ(
            model=TestModelCNN(width=84, height=84, num_actions=num_actions))

        q = SimpleQ(model=SimpleModelQ(
            dims=dims,
            num_actions=num_actions,
            mlp=nn.Sequential(nn.Linear(dims, 32, bias=False), nn.ReLU(),
                              nn.Linear(32, 16, bias=False), nn.ReLU(),
                              nn.Linear(16, num_actions, bias=False))))

        batch_size = 10
        height, width = 84, 84
        sensor = np.zeros([batch_size, dims]).astype("float32")
        image = np.zeros([batch_size, 1, height, width]).astype("float32")

        ct0 = ComputationTask(algorithm=q_cnn)
        ct1 = ComputationTask(algorithm=q)

        test(dict(image=image), ct0, max=False)
        test(dict(sensor=sensor), ct1, max=True)
Example #3
    def test_ct_learning(self):
        """
        Test training
        """
        num_actions = 2
        dims = 100
        batch_size = 8
        sensor = np.ones(
            [batch_size, dims]).astype("float32") / dims  # normalize
        next_sensor = np.zeros([batch_size, dims]).astype("float32")

        for on_policy in [True, False]:
            if on_policy:
                alg = SimpleAC(model=SimpleModelAC(
                    dims=dims,
                    num_actions=num_actions,
                    mlp=nn.Sequential(
                        nn.Linear(dims, 64, bias=False),
                        nn.ReLU(),
                        nn.Linear(64, 32, bias=False),
                        nn.ReLU())))
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))
            else:
                alg = SimpleQ(
                    model=SimpleModelQ(
                        dims=dims,
                        num_actions=num_actions,
                        mlp=nn.Sequential(
                            nn.Linear(dims, 64, bias=False),
                            nn.ReLU(),
                            nn.Linear(64, 32, bias=False),
                            nn.ReLU(),
                            nn.Linear(32, num_actions, bias=False))),
                    update_ref_interval=100)
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))

            for i in range(1000):
                if on_policy:
                    outputs, _ = ct.predict(inputs=dict(sensor=sensor))
                    actions = outputs["action"]
                else:
                    ## randomly assemble a batch
                    actions = np.random.choice(
                        [0, 1], size=(batch_size, 1),
                        p=[0.5, 0.5]).astype("int")
                rewards = (1.0 - actions).astype("float32")
                cost = ct.learn(
                    inputs=dict(sensor=sensor),
                    next_inputs=dict(sensor=next_sensor),
                    next_alive=dict(alive=np.zeros(
                        (batch_size, 1)).astype("float32")),
                    actions=dict(action=actions),
                    rewards=dict(reward=rewards))

            ### the policy should bias towards the first action
            outputs, _ = ct.predict(inputs=dict(sensor=sensor))
            for a in outputs["action"]:
                self.assertEqual(a[0], 0)
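Note on the learn() call above: next_alive is always a batch of zeros, so every transition is treated as terminal and the temporal-difference target reduces to the immediate reward, which is 1 for action 0 and 0 for action 1 (rewards = 1.0 - actions). That is why the learned policy should end up picking action 0. Below is a minimal sketch of that target; the function name and discount value are illustrative assumptions, not code from the framework.

    ## Hypothetical illustration only -- not code from SimpleQ/SimpleAC.
    ## With alive == 0 the bootstrapped term vanishes and the target for
    ## the taken action is just the immediate reward, so action 0 wins.
    def td_target(reward, next_value, alive, discount=0.99):
        return reward + discount * alive * next_value

    assert td_target(reward=1.0, next_value=5.0, alive=0.0) == 1.0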
Example #4
    for _ in range(num_agents):
        envs.append(GymEnv(game))
    state_shape = envs[-1].observation_dims()[0]
    num_actions = envs[-1].action_dims()[0]

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small MLP and apply the Q-learning algorithm
    inner_size = 256
    mlp = nn.Sequential(
        nn.Linear(state_shape[0], inner_size),
        nn.ReLU(),
        nn.Linear(inner_size, inner_size),
        nn.ReLU(),
        nn.Linear(inner_size, inner_size),
        nn.ReLU())

    alg = SimpleQ(
        model=SimpleModelQ(
            dims=state_shape, num_actions=num_actions, perception_net=mlp),
        exploration_end_steps=500000 / num_agents,
        update_ref_interval=100)

    # 3. Specify the settings for learning: the algorithm to use (SimpleQ
    # in this case), data sampling strategy (ExpReplayHelper here) and other
    # settings used by ComputationTask.
    ct_settings = {
        "RL": dict(
            num_agents=num_agents,
            algorithm=alg,
            hyperparas=dict(lr=1e-4),
            # sampling
            agent_helper=ExpReplayHelper,
            buffer_capacity=200000 / num_agents,
            num_experiences=4,  # num per agent
Example #5
    # 2. Construct the network and specify the algorithm.
    #    Here we use a small CNN as the perception net for the Q-learning algorithm
    cnn = nn.Sequential(
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # flatten the CNN cube to a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU())

    alg = SimpleQ(model=SimpleModelQ(dims=(d, h, w),
                                     num_actions=num_actions,
                                     perception_net=cnn),
                  gpu_id=0,
                  exploration_end_steps=500000 / num_agents,
                  update_ref_interval=100)

    # 3. Specify the settings for learning: data sampling strategy
    # (ExpReplayHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            num_agents=num_agents,
            algorithm=alg,
            hyperparas=dict(lr=1e-4, grad_clip=5.0),
            # sampling
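The Flatten module used in the CNN above is not defined in this snippet. A minimal sketch, assuming it only needs to collapse the convolutional feature map into one vector per sample (the repository's own definition may differ):

    import torch.nn as nn

    class Flatten(nn.Module):
        ## Collapse everything except the batch dimension, e.g.
        ## [N, 64, 7, 7] -> [N, 64 * 7 * 7], so that nn.Linear can follow.
        def forward(self, x):
            return x.view(x.size(0), -1)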
Example #6
    def test_gym_games(self):
        """
        Test games in OpenAI gym.
        """

        games = ["MountainCar-v0", "CartPole-v0", "Pendulum-v0"]
        final_rewards_thresholds = [
            -1.5,  ## drive to the right top in 150 steps (timeout is -2.0)
            1.5,  ## hold the pole for at least 150 steps
            -3.0  ## can swing the stick to the top most of the time
        ]
        on_policies = [False, True, False]
        discrete_actions = [True, True, False]

        for game, threshold, on_policy, discrete_action in \
            zip(games, final_rewards_thresholds, on_policies, discrete_actions):

            env = gym.make(game)
            state_shape = env.observation_space.shape[0]
            if discrete_action:
                num_actions = env.action_space.n
            else:
                num_actions = env.action_space.shape[0]

            hidden_size = 256

            mlp = nn.Sequential(
                nn.Linear(state_shape, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU())

            q_model = SimpleModelQ(
                dims=state_shape,
                num_actions=num_actions,
                mlp=nn.Sequential(mlp, nn.Linear(hidden_size, num_actions)))

            if on_policy:
                alg = SimpleSARSA(model=q_model, epsilon=0.1)
                # alg = SuccessorRepresentationQ(
                #     ## much slower than SARSA because of more things to learn
                #     model=SimpleSRModel(
                #         dims=state_shape,
                #         hidden_size=hidden_size,
                #         num_actions=num_actions, ),
                #     exploration_end_steps=20000)
            else:
                if discrete_action:
                    alg = SimpleQ(
                        model=q_model,
                        exploration_end_steps=200000,
                        update_ref_interval=100)
                else:
                    alg = OffPolicyAC(
                        model=GaussianPolicyModel(
                            dims=state_shape,
                            action_dims=num_actions,
                            mlp=mlp,
                            std=1.0),
                        epsilon=0.2)

            glog.info("algorithm: " + alg.__class__.__name__)

            ct = ComputationTask("RL", algorithm=alg, hyperparas=dict(lr=1e-4))
            batch_size = 32
            if not on_policy:
                train_every_steps = batch_size // 4
                buffer_size_limit = 200000

            max_episode = 10000

            average_episode_reward = []
            past_exps = []
            max_steps = env._max_episode_steps
            for n in range(max_episode):
                ob = env.reset()
                episode_reward = 0
                alive = 1
                for t in range(max_steps):
                    inputs = dict(sensor=np.array([ob]).astype("float32"))
                    res, _ = ct.predict(inputs=inputs)

                    ## when discrete_action is True, this is a scalar;
                    ## otherwise it is a floating-point vector
                    pred_action = res["action"][0]

                    ## end before the env wrongly gives game_over=True for a timeout case
                    if t == max_steps - 1:
                        past_exps.append(
                            (inputs, res, dict(reward=[[0]]),
                             dict(alive=[[-1]])))  ## -1 denotes timeout
                        break
                    elif (not alive):
                        past_exps.append((inputs, res, dict(reward=[[0]]),
                                          dict(alive=[[alive]])))
                        break
                    else:
                        next_ob, reward, next_is_over, _ = env.step(
                            pred_action[0] if discrete_action else pred_action)
                        reward /= 100
                        episode_reward += reward
                        past_exps.append((inputs, res, dict(reward=[[reward]]),
                                          dict(alive=[[alive]])))

                    ## we use a circular buffer only for off-policy training
                    if (not on_policy) and len(past_exps) > buffer_size_limit:
                        past_exps.pop(0)

                    ## compute the learning condition
                    learn_cond = False
                    if on_policy:
                        learn_cond = (len(past_exps) >= batch_size)
                    else:
                        learn_cond = (
                            t % train_every_steps == train_every_steps - 1)

                    if learn_cond:
                        exps = sample(past_exps, batch_size)
                        sampled_inputs, next_sampled_inputs, sampled_actions, \
                            next_sampled_actions, reward, next_alive = unpack_exps(exps)
                        cost = ct.learn(
                            inputs=sampled_inputs,
                            next_inputs=next_sampled_inputs,
                            next_alive=next_alive,
                            actions=sampled_actions,
                            next_actions=next_sampled_actions,
                            rewards=reward)
                        ## we clear the exp buffer for on-policy
                        if on_policy:
                            past_exps = []

                    ob = next_ob
                    ### bool must be converted to int for correct computation
                    alive = 1 - int(next_is_over)

                if n % 50 == 0:
                    glog.info("episode reward: %f" % episode_reward)

                average_episode_reward.append(episode_reward)
                if len(average_episode_reward) > 20:
                    average_episode_reward.pop(0)

                ### once the threshold is hit, we stop running more episodes
                if sum(average_episode_reward) / len(
                        average_episode_reward) > threshold:
                    glog.info(
                        "Test terminates early because the threshold is satisfied!")
                    break

            ### compare the average episode reward to reduce variance
            self.assertGreater(
                sum(average_episode_reward) / len(average_episode_reward),
                threshold)
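The helpers sample and unpack_exps are not shown in this snippet. Judging only from how they are called above, sample has to draw stored steps together with their successor steps, and unpack_exps has to stack the (inputs, res, reward, alive) tuples into the dict-of-arrays format that ct.learn() expects. The sketch below is one way to do that under those assumptions; it is not the repository's implementation.

    import random
    import numpy as np

    def sample(past_exps, batch_size):
        ## Hypothetical: draw (step, next step) pairs with replacement,
        ## excluding the last stored step, which has no successor yet.
        idx = [random.randint(0, len(past_exps) - 2) for _ in range(batch_size)]
        return [(past_exps[i], past_exps[i + 1]) for i in idx]

    def unpack_exps(exps):
        ## Hypothetical: each element of exps pairs the stored tuple
        ## (inputs, res, reward_dict, alive_dict) of step t with that of t+1.
        def stack(values, dtype=None):
            return np.concatenate([np.array(v, dtype=dtype) for v in values])

        return (dict(sensor=stack([e[0][0]["sensor"] for e in exps])),
                dict(sensor=stack([e[1][0]["sensor"] for e in exps])),
                dict(action=stack([e[0][1]["action"] for e in exps])),
                dict(action=stack([e[1][1]["action"] for e in exps])),
                dict(reward=stack([e[0][2]["reward"] for e in exps], "float32")),
                dict(alive=stack([e[1][3]["alive"] for e in exps], "float32")))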
Example #7
    def test_gym_games(self):
        """
        Test games in OpenAI gym.
        """

        games = ["MountainCar-v0", "CartPole-v0"]
        final_rewards_thresholds = [
            -1.8,  ## drive to the right top in 180 steps (timeout is -2.0)
            1.5  ## hold the pole for at least 150 steps
        ]

        for game, threshold in zip(games, final_rewards_thresholds):
            for on_policy in [False, True]:

                if on_policy and game != "CartPole-v0":
                    ## SimpleAC has difficulty training mountain-car and acrobot
                    continue

                env = gym.make(game)
                state_shape = env.observation_space.shape[0]
                num_actions = env.action_space.n

                mlp = nn.Sequential(nn.Linear(state_shape, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU())

                if on_policy:
                    alg = SimpleAC(model=SimpleModelAC(dims=state_shape,
                                                       num_actions=num_actions,
                                                       mlp=mlp),
                                   hyperparas=dict(lr=1e-3))
                else:
                    alg = SimpleQ(model=SimpleModelQ(
                        dims=state_shape,
                        num_actions=num_actions,
                        mlp=nn.Sequential(mlp, nn.Linear(128, num_actions))),
                                  hyperparas=dict(lr=1e-4),
                                  exploration_end_batches=25000,
                                  update_ref_interval=100)

                print "algorithm: " + alg.__class__.__name__

                ct = ComputationTask(algorithm=alg)
                batch_size = 16
                if not on_policy:
                    train_every_steps = batch_size // 4
                    buffer_size_limit = 100000

                max_episode = 5000

                average_episode_reward = []
                past_exps = []
                max_steps = env._max_episode_steps
                for n in range(max_episode):
                    ob = env.reset()
                    episode_reward = 0
                    for t in range(max_steps):
                        res, _ = ct.predict(inputs=dict(
                            sensor=np.array([ob]).astype("float32")))
                        pred_action = res["action"][0][0]

                        next_ob, reward, next_is_over, _ = env.step(
                            pred_action)
                        reward /= 100
                        episode_reward += reward

                        past_exps.append((ob, next_ob, [pred_action], [reward],
                                          [not next_is_over]))
                        ## we use a circular buffer only for off-policy training
                        if (not on_policy) and len(past_exps) > buffer_size_limit:
                            past_exps.pop(0)

                        ## compute the learning condition
                        learn_cond = False
                        if on_policy:
                            learn_cond = (len(past_exps) >= batch_size)
                            exps = past_exps  ## directly use all exps in the buffer
                        else:
                            learn_cond = (
                                t % train_every_steps == train_every_steps - 1)
                            exps = sample(past_exps,
                                          batch_size)  ## sample some exps

                        if learn_cond:
                            sensor, next_sensor, action, reward, next_episode_end \
                                = unpack_exps(exps)
                            cost = ct.learn(
                                inputs=dict(sensor=sensor),
                                next_inputs=dict(next_sensor=next_sensor),
                                next_episode_end=dict(
                                    next_episode_end=next_episode_end),
                                actions=dict(action=action),
                                rewards=dict(reward=reward))
                            ## we clear the exp buffer for on-policy
                            if on_policy:
                                past_exps = []

                        ob = next_ob

                        ## end before the Gym wrongly gives game_over=True for a timeout case
                        if t == max_steps - 2 or next_is_over:
                            break

                    if n % 50 == 0:
                        print("episode reward: %f" % episode_reward)

                    average_episode_reward.append(episode_reward)
                    if len(average_episode_reward) > 20:
                        average_episode_reward.pop(0)

                ### compare the average episode reward to reduce variance
                self.assertGreater(
                    sum(average_episode_reward) / len(average_episode_reward),
                    threshold)
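As in the previous example, sample and unpack_exps are not part of the snippet. Here the stored tuples are already plain (ob, next_ob, [action], [reward], [not next_is_over]) lists, so a plausible unpack_exps just stacks each field into an array with one row per experience; both helpers below are assumptions for illustration, not the original code.

    import random
    import numpy as np

    def sample(past_exps, batch_size):
        ## Hypothetical: uniform sampling with replacement from the buffer.
        return [random.choice(past_exps) for _ in range(batch_size)]

    def unpack_exps(exps):
        ## Hypothetical: stack each field of (ob, next_ob, action, reward,
        ## not_is_over) into a batched numpy array.
        sensor = np.array([e[0] for e in exps], dtype="float32")
        next_sensor = np.array([e[1] for e in exps], dtype="float32")
        action = np.array([e[2] for e in exps], dtype="int64")
        reward = np.array([e[3] for e in exps], dtype="float32")
        next_episode_end = np.array([e[4] for e in exps], dtype="float32")
        return sensor, next_sensor, action, reward, next_episode_end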