Example #1
    # 2. Construct the network and specify the algorithm.
    #    Here we use a small CNN as the perception net for the Actor-Critic algorithm
    cnn = nn.Sequential(
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # flatten the CNN cube to a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU())

    alg = SimpleAC(model=SimpleModelAC(dims=(d, h, w),
                                       num_actions=num_actions,
                                       perception_net=cnn),
                   gpu_id=1)

    # 3. Specify the settings for learning: data sampling strategy
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            algorithm=alg,
            hyperparas=dict(grad_clip=5.0),
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=5,
            num_agents=num_agents)
    }
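The `Flatten()` module used in the perception net above is not defined in this excerpt. On PyTorch versions that predate `nn.Flatten`, a minimal sketch of such a helper could look like the following (an assumption about its behavior, not necessarily the repository's own implementation):

    import torch.nn as nn

    class Flatten(nn.Module):
        """Flatten a (N, C, H, W) feature cube into a (N, C*H*W) vector."""

        def forward(self, x):
            return x.view(x.size(0), -1)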
Example #2
    def test_ct_learning(self):
        """
        Test training
        """
        num_actions = 2
        dims = 100
        batch_size = 8
        sensor = np.ones(
            [batch_size, dims]).astype("float32") / dims  # normalize
        next_sensor = np.zeros([batch_size, dims]).astype("float32")

        for on_policy in [True, False]:
            if on_policy:
                alg = SimpleAC(model=SimpleModelAC(
                    dims=dims,
                    num_actions=num_actions,
                    mlp=nn.Sequential(
                        nn.Linear(dims, 64, bias=False),
                        nn.ReLU(),
                        nn.Linear(64, 32, bias=False),
                        nn.ReLU())))
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))
            else:
                alg = SimpleQ(
                    model=SimpleModelQ(
                        dims=dims,
                        num_actions=num_actions,
                        mlp=nn.Sequential(
                            nn.Linear(dims, 64, bias=False),
                            nn.ReLU(),
                            nn.Linear(64, 32, bias=False),
                            nn.ReLU(),
                            nn.Linear(32, num_actions, bias=False))),
                    update_ref_interval=100)
                ct = ComputationTask(
                    "test", algorithm=alg, hyperparas=dict(lr=1e-1))

            for i in range(1000):
                if on_policy:
                    outputs, _ = ct.predict(inputs=dict(sensor=sensor))
                    actions = outputs["action"]
                else:
                    ## randomly assemble a batch
                    actions = np.random.choice(
                        [0, 1], size=(batch_size, 1),
                        p=[0.5, 0.5]).astype("int")
                rewards = (1.0 - actions).astype("float32")
                cost = ct.learn(
                    inputs=dict(sensor=sensor),
                    next_inputs=dict(sensor=next_sensor),
                    next_alive=dict(alive=np.zeros(
                        (batch_size, 1)).astype("float32")),
                    actions=dict(action=actions),
                    rewards=dict(reward=rewards))

            ### the policy should be biased towards the first action (0),
            ### which always receives reward 1.0
            outputs, _ = ct.predict(inputs=dict(sensor=sensor))
            for a in outputs["action"]:
                self.assertEqual(a[0], 0)
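Since `next_alive` is all zeros in this test, every transition is treated as terminal, so the bootstrapped part of the learning target presumably vanishes and only the immediate reward (`1.0 - action`) is fitted. A rough sketch of that standard one-step target, for illustration only (not the library's internal code):

    def one_step_target(reward, next_alive, next_value, discount=0.99):
        """Standard TD target; the bootstrap term is masked out
        when the next state is terminal (next_alive == 0)."""
        return reward + discount * next_alive * next_value

    # With next_alive == 0 the target equals the reward, so action 0
    # (reward 1.0) is reinforced over action 1 (reward 0.0).
    print(one_step_target(reward=1.0, next_alive=0.0, next_value=5.0))  # 1.0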
Example #3
    num_games = 8000
    # 1. Create environments
    envs = []
    for _ in range(num_agents):
        envs.append(GymEnv(game))
    state_shape = envs[-1].observation_dims()[0]
    num_actions = envs[-1].action_dims()[0]

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small MLP and apply the Actor-Critic algorithm
    mlp = nn.Sequential(
        nn.Linear(state_shape[0], 128),
        nn.ReLU(),
        nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 128), nn.ReLU())

    alg = SimpleAC(model=SimpleModelAC(
        dims=state_shape, num_actions=num_actions, perception_net=mlp))

    # 3. Specify the settings for learning: data sampling strategy
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL": dict(
            algorithm=alg,
            hyperparas=dict(lr=5e-5),
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=4,
            num_agents=num_agents)
    }
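`GymEnv.observation_dims()` and `action_dims()` presumably wrap the underlying Gym spaces; with plain `gym` the equivalent lookups for a discrete-action game would be roughly:

    import gym

    env = gym.make(game)                       # e.g. "CartPole-v0"
    state_shape = env.observation_space.shape  # tuple such as (4,)
    num_actions = env.action_space.n           # int such as 2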
Example #4
    reward_shaping_f = lambda x: x / 100.0
    agents = []
    for _ in range(num_agents):
        agent = SimpleRNNRLAgent(num_games, reward_shaping_f=reward_shaping_f)
        agent.set_env(GymEnv, game_name=game)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small MLP and apply the Actor-Critic algorithm
    hidden_size = 128
    mlp = nn.Sequential(nn.Linear(state_shape[0], hidden_size), nn.ReLU(),
                        nn.Linear(hidden_size, hidden_size), nn.ReLU())

    alg = SimpleAC(model=SimpleRNNModelAC(dims=state_shape,
                                          num_actions=num_actions,
                                          perception_net=mlp),
                   optim=(optim.RMSprop, dict(lr=1e-4)),
                   ntd=True)

    # 3. Specify the settings for learning: the algorithm to use (SimpleAC
    # in this case), data sampling strategy (OnlineHelper here) and other
    # settings used by ComputationTask.
    ct_settings = {
        "RL":
        dict(
            alg=alg,
            # sampling
            agent_helper=OnlineHelper,
            sample_interval=8,
            num_agents=num_agents)
    }
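`ntd=True` presumably enables n-step temporal-difference targets for the critic. As a generic illustration of the idea (not this library's implementation), an n-step return discounts a short reward sequence and bootstraps from a value estimate at the end:

    def n_step_return(rewards, bootstrap_value, discount=0.99):
        """Discounted sum of the sampled rewards plus a bootstrapped tail."""
        ret = bootstrap_value
        for r in reversed(rewards):
            ret = r + discount * ret
        return ret

    # e.g. three shaped rewards followed by a critic estimate of 0.5
    print(n_step_return([0.01, 0.01, 0.01], bootstrap_value=0.5))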
Example #5
        nn.Conv2d(32, 32, kernel_size=5, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(32, 64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(64, 64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        Flatten(),  # flatten the CNN cube to a vector
        nn.Linear(1920, 512),
        nn.ReLU())

    alg = SimpleAC(model=SimpleModelAC(dims=(d, h, w),
                                       num_actions=num_actions,
                                       perception_net=cnn),
                   optim=(optim.RMSprop, dict(lr=1e-4)),
                   gpu_id=1)

    # 3. Specify the settings for learning: data sampling strategy
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            alg=alg,
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=2,
            num_agents=num_agents)
    }
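The `nn.Linear(1920, 512)` layer hard-codes the flattened size of the final convolutional feature map, which depends on the game's input resolution `(d, h, w)`. A common way to derive that number instead of computing it by hand is to push a dummy observation through the convolutional part of the stack; a sketch, assuming a PyTorch version where `nn.Sequential` supports slicing:

    import torch

    conv_part = cnn[:-3]  # everything before the Flatten/Linear/ReLU tail
    with torch.no_grad():
        flat_size = conv_part(torch.zeros(1, d, h, w)).numel()
    print(flat_size)  # should print 1920 for this game's resolution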
Example #6
    reward_shaping_f = lambda x: x / 100.0
    agents = []
    for _ in range(num_agents):
        agent = SimpleRLAgent(num_games, reward_shaping_f=reward_shaping_f)
        agent.set_env(GymEnv, game_name=game)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small MLP and apply the Actor-Critic algorithm
    mlp = nn.Sequential(nn.Linear(state_shape[0], 128), nn.ReLU(),
                        nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 128),
                        nn.ReLU())

    alg = SimpleAC(model=SimpleModelAC(dims=state_shape,
                                       num_actions=num_actions,
                                       perception_net=mlp),
                   optim=(optim.RMSprop, dict(lr=5e-5)),
                   gpu_id=-1)  ## use cpu

    # 3. Specify the settings for learning: data sampling strategy
    # (OnlineHelper here) and other settings used by
    # ComputationTask.
    ct_settings = {
        "RL":
        dict(
            alg=alg,
            # sampling
            agent_helper=OnlineHelper,
            # each agent will call `learn()` every `sample_interval` steps
            sample_interval=4,
            num_agents=num_agents)
    }
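`reward_shaping_f` simply rescales the raw Gym reward before it reaches the learner, which keeps value targets small; for example, CartPole's per-step reward of +1 becomes 0.01. The same idea as a standalone sketch:

    def shape_reward(raw_reward, scale=100.0):
        """Rescale raw environment rewards before learning."""
        return raw_reward / scale

    print(shape_reward(1.0))   # CartPole step reward  -> 0.01
    print(shape_reward(-1.0))  # MountainCar step cost -> -0.01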
Example #7
    def test_gym_games(self):
        """
        Test games in OpenAI gym.
        """

        games = ["MountainCar-v0", "CartPole-v0"]
        final_rewards_thresholds = [
            -1.8,  ## reach the right hilltop within 180 steps (a full timeout scores -2.0)
            1.5  ## hold the pole for at least 150 steps
        ]

        for game, threshold in zip(games, final_rewards_thresholds):
            for on_policy in [False, True]:

                if on_policy and game != "CartPole-v0":
                    ## SimpleAC has difficulty training mountain-car and acrobot
                    continue

                env = gym.make(game)
                state_shape = env.observation_space.shape[0]
                num_actions = env.action_space.n

                mlp = nn.Sequential(nn.Linear(state_shape, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU(),
                                    nn.Linear(128, 128), nn.ReLU())

                if on_policy:
                    alg = SimpleAC(model=SimpleModelAC(dims=state_shape,
                                                       num_actions=num_actions,
                                                       mlp=mlp),
                                   hyperparas=dict(lr=1e-3))
                else:
                    alg = SimpleQ(model=SimpleModelQ(
                        dims=state_shape,
                        num_actions=num_actions,
                        mlp=nn.Sequential(mlp, nn.Linear(128, num_actions))),
                                  hyperparas=dict(lr=1e-4),
                                  exploration_end_batches=25000,
                                  update_ref_interval=100)

                print "algorithm: " + alg.__class__.__name__

                ct = ComputationTask(algorithm=alg)
                batch_size = 16
                if not on_policy:
                    train_every_steps = batch_size // 4
                    buffer_size_limit = 100000

                max_episode = 5000

                average_episode_reward = []
                past_exps = []
                max_steps = env._max_episode_steps
                for n in range(max_episode):
                    ob = env.reset()
                    episode_reward = 0
                    for t in range(max_steps):
                        res, _ = ct.predict(inputs=dict(
                            sensor=np.array([ob]).astype("float32")))
                        pred_action = res["action"][0][0]

                        next_ob, reward, next_is_over, _ = env.step(
                            pred_action)
                        reward /= 100
                        episode_reward += reward

                        past_exps.append((ob, next_ob, [pred_action], [reward],
                                          [not next_is_over]))
                        ## only for off-policy training do we use a circular buffer
                        if not on_policy and len(past_exps) > buffer_size_limit:
                            past_exps.pop(0)

                        ## compute the learning condition
                        learn_cond = False
                        if on_policy:
                            learn_cond = (len(past_exps) >= batch_size)
                            exps = past_exps  ## directly use all exps in the buffer
                        else:
                            learn_cond = (
                                t % train_every_steps == train_every_steps - 1)
                            exps = sample(past_exps,
                                          batch_size)  ## sample some exps

                        if learn_cond:
                            sensor, next_sensor, action, reward, next_episode_end \
                                = unpack_exps(exps)
                            cost = ct.learn(
                                inputs=dict(sensor=sensor),
                                next_inputs=dict(next_sensor=next_sensor),
                                next_episode_end=dict(
                                    next_episode_end=next_episode_end),
                                actions=dict(action=action),
                                rewards=dict(reward=reward))
                            ## we clear the exp buffer for on-policy
                            if on_policy:
                                past_exps = []

                        ob = next_ob

                        ## stop one step early so that Gym's timeout is not mistaken for a real game over
                        if t == max_steps - 2 or next_is_over:
                            break

                    if n % 50 == 0:
                        print("episode reward: %f" % episode_reward)

                    average_episode_reward.append(episode_reward)
                    if len(average_episode_reward) > 20:
                        average_episode_reward.pop(0)

                ### compare the average episode reward to reduce variance
                self.assertGreater(
                    sum(average_episode_reward) / len(average_episode_reward),
                    threshold)
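The helpers `sample` and `unpack_exps` used above are not shown in this excerpt; `sample` is presumably `random.sample`, and a minimal `unpack_exps` matching the keys fed to `ct.learn()` might look like this (an assumed sketch, not the test's actual helper):

    import numpy as np
    from random import sample  # draws the random mini-batch above

    def unpack_exps(exps):
        """Turn a list of (ob, next_ob, action, reward, alive) tuples into
        batched numpy arrays with the dtypes ct.learn() expects."""
        sensor, next_sensor, action, reward, next_episode_end = zip(*exps)
        return (np.array(sensor).astype("float32"),
                np.array(next_sensor).astype("float32"),
                np.array(action).astype("int"),
                np.array(reward).astype("float32"),
                np.array(next_episode_end).astype("float32"))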