def main(args):
    """Set up and launch multi-agent training on Breakout from image input.

    The function builds one probe environment to discover the observation
    and action dimensions, creates a pool of agents that each own their own
    environment, assembles a convolutional perception network, and hands
    everything to a ``Manager`` which runs the pipeline.

    Args:
        args: Parsed command-line arguments. Only ``args.name`` is read
            here; it is forwarded to ``get_settings`` as ``name``.
    """
    game = "Breakout-v0"
    num_agents = 16
    num_games = 8000
    im_height, im_width = 84, 84

    # Image options shared by the probe environment and every agent's
    # environment, kept in one place so the two cannot drift apart.
    image_opts = dict(contexts=4, height=im_height, width=im_width, gray=True)

    # This environment instance is only queried for its dimensions; the
    # agents construct their own environments through set_env() below.
    probe_env = GymEnvImage(game, **image_opts)
    d, h, w = probe_env.observation_dims()["sensor"]
    num_actions = probe_env.action_dims()["action"]

    # 1. Spawn one agent per environment instance. Rewards are passed
    #    through np.sign, so only their sign reaches the learner.
    def spawn_agent():
        new_agent = SimpleRLAgent(num_games, reward_shaping_f=np.sign)
        new_agent.set_env(GymEnvImage, game_name=game, **image_opts)
        return new_agent

    agents = [spawn_agent() for _ in range(num_agents)]

    # 2. Build the CNN used as the perception net.
    #    With an 84x84 input the three unpadded convolutions yield spatial
    #    sizes 20 -> 9 -> 7, hence the 7 * 7 * 64 inputs of the linear layer.
    perception_layers = [
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # collapse the feature cube into a vector
        nn.Linear(7 * 7 * 64, 512),
        nn.ReLU(),
    ]
    cnn = nn.Sequential(*perception_layers)

    # 3. Obtain the learning settings (the algorithm itself is chosen inside
    #    get_settings, which is defined elsewhere).
    ct_settings = get_settings(
        cnn, (d, h, w), num_actions, num_agents, name=args.name)

    # 4. The Manager drives the whole pipeline.
    manager = Manager(ct_settings)
    manager.add_agents(agents)
    manager.start()
# Example #2
0
    def run(self, args):
        """Train an OffPolicyAC learner and report the mean total reward.

        Args:
            args: Configuration object. The attributes read here are
                ``lr``, ``entropy_w``, ``gpu``, ``agents_per_batch``,
                ``history_len``, ``log_interval``, ``num_agents``,
                ``num_games`` and ``game``; ``args`` is also passed to
                ``self.make_model``.

        Returns:
            The ``np.mean`` of ``manager.stats['All'].data_q['total_reward']``
            evaluated after ``manager.start()`` has returned.
        """
        # 1. Model, optimizer and algorithm.
        model = self.make_model(args)
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr)
        algorithm = OffPolicyAC(
            model=model,
            optim=optimizer,
            epsilon=0.2,
            prob_entropy_weight=args.entropy_w,
            gpu_id=args.gpu)

        # 2. Learning and logging settings.
        ct_settings = {
            "RL": {
                "alg": algorithm,
                # sampling
                "agent_helper": OnlineHelper,
                "agents_per_batch": args.agents_per_batch,
                # each agent will call `learn()` every `sample_interval` steps
                "sample_interval": args.history_len,
            }
        }
        log_settings = {"print_interval": args.log_interval}

        # 3. One agent per environment; rewards are scaled down by 100.
        def reward_shaping_f(x):
            return x / 100

        def spawn_agent():
            new_agent = SimpleRLAgent(
                args.num_games, reward_shaping_f=reward_shaping_f)
            new_agent.set_env(GymEnv, game_name=args.game)
            return new_agent

        agents = [spawn_agent() for _ in range(args.num_agents)]

        # 4. The Manager drives the whole pipeline.
        manager = Manager(ct_settings, log_settings)
        manager.add_agents(agents)
        manager.start()

        # 5. Average the total rewards recorded in the manager's statistics.
        recorded_rewards = manager.stats['All'].data_q['total_reward']
        return np.mean(recorded_rewards)
# Example #3
0
                    height=im_height,
                    width=im_width,
                    gray=True)

    env = env_class(**env_args)
    d, h, w = env.observation_dims()["sensor"]
    num_actions = env.action_dims()["action"]

    # 1. Spawn one agent for each instance of environment.
    #    Agent's behavior depends on the actual algorithm being used. Since we
    #    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
    agents = []
    for _ in range(num_agents):
        agent = SimpleRLAgent(
            num_games, reward_shaping_f=np.sign)  # ignore reward magnitude
        agent.set_env(env_class, **env_args)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small CNN as the perception net for the Actor-Critic algorithm
    cnn = nn.Sequential(
        nn.Conv2d(d, 32, kernel_size=5, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(32, 32, kernel_size=5, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(32, 64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(64, 64, kernel_size=3, padding=1),
# Example #4
0
                      contexts=4,
                      height=im_height,
                      width=im_width,
                      gray=True)
    d, h, w = env.observation_dims()["sensor"]
    num_actions = env.action_dims()["action"]

    # 1. Spawn one agent for each instance of environment.
    #    Agent's behavior depends on the actual algorithm being used. Since we
    #    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
    agents = []
    for _ in range(num_agents):
        agent = SimpleRLAgent(num_games, reward_shaping_f=np.sign)
        agent.set_env(GymEnvImage,
                      game_name=game,
                      contexts=4,
                      height=im_height,
                      width=im_width,
                      gray=True)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small CNN as the perception net for the Actor-Critic algorithm
    cnn = nn.Sequential(
        nn.Conv2d(d, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
        Flatten(),  # flatten the CNN cube to a vector
        nn.Linear(7 * 7 * 64, 512),
# Example #5
0
    num_agents = 16
    num_games = 8000

    env = GymEnv(game)
    state_shape = env.observation_dims()["sensor"]
    num_actions = env.action_dims()["action"]

    # 1. Spawn one agent for each instance of environment.
    #    Agent's behavior depends on the actual algorithm being used. Since we
    #    are using SimpleAC, a proper type of Agent is SimpleRLAgent.
    reward_shaping_f = lambda x: x / 100.0
    agents = []
    for _ in range(num_agents):
        agent = SimpleRLAgent(num_games, reward_shaping_f=reward_shaping_f)
        agent.set_env(GymEnv, game_name=game)
        agents.append(agent)

    # 2. Construct the network and specify the algorithm.
    #    Here we use a small MLP and apply the Actor-Critic algorithm
    mlp = nn.Sequential(nn.Linear(state_shape[0], 128), nn.ReLU(),
                        nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 128),
                        nn.ReLU())

    alg = SimpleAC(model=SimpleModelAC(dims=state_shape,
                                       num_actions=num_actions,
                                       perception_net=mlp),
                   optim=(optim.RMSprop, dict(lr=5e-5)),
                   gpu_id=-1)  ## use cpu

    # 3. Specify the settings for learning: data sampling strategy