예제 #1
0
def play_func(params, net, cuda, exp_queue):
    """Play the environment with an epsilon-greedy DQN agent and stream results.

    Builds the environment named by ``params.env_name``, wraps it with
    ``wrap_dqn`` and seeds it with ``common.SEED``. It then iterates over an
    ``ExperienceSourceFirstLast`` driven by a ``DQNAgent`` around ``net``,
    putting every produced experience item on ``exp_queue``. After each item,
    one ``EpisodeEnded(reward, steps, epsilon)`` record is put on the queue for
    every ``(reward, steps)`` pair returned by ``pop_rewards_steps()``.

    Args:
        params: Hyperparameter object; ``env_name``, ``epsilon_start`` and
            ``gamma`` are read here, and the whole object is handed to
            ``common.EpsilonTracker``.
        net: Model passed to ``DQNAgent``.
        cuda: Truthy to place the agent on ``"cuda"``, otherwise ``"cpu"``.
        exp_queue: Object exposing ``put()``; receives experience items and
            ``EpisodeEnded`` records.
            NOTE(review): presumably a multiprocessing queue shared with a
            training process -- confirm against the caller.

    Returns:
        None. The function only returns once the experience source stops
        yielding items.
    """
    run_device = torch.device("cuda" if cuda else "cpu")

    environment = wrap_dqn(make(params.env_name))
    environment.seed(common.SEED)

    action_selector = EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    eps_schedule = common.EpsilonTracker(action_selector, params)
    source = ExperienceSourceFirstLast(
        environment,
        DQNAgent(net, action_selector, device=run_device),
        gamma=params.gamma)

    for step, transition in enumerate(source):
        # The tracker is fed the item index scaled down by BATCH_MUL.
        eps_schedule.frame(step / BATCH_MUL)
        exp_queue.put(transition)
        # Report finished episodes together with the epsilon in effect.
        for total_reward, n_steps in source.pop_rewards_steps():
            exp_queue.put(
                EpisodeEnded(total_reward, n_steps, action_selector.epsilon))
예제 #2
0
                        type=int,
                        default=DEFAULT_N_STEPS,
                        help="steps to do on Bellman unroll")
    args = parser.parse_args()
    device = device("cuda" if args.cuda else "cpu")

    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(123)
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = TargetNet(net)

    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast(env,
                                           agent,
                                           gamma=params.gamma,
                                           steps_count=args.n)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss = common.calc_loss_dqn(batch,
                                    net,
                                    tgt_net.target_model,
                                    gamma=params.gamma**args.n,
                                    device=device)
        loss.backward()
                        help='Enter the number of steps to unroll bellman eq')
    args = parser.parse_args()

    print('Starting...')
    params = HYPERPARAMS['pong']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Running on Device {}'.format(device))
    writer = writer = SummaryWriter(comment="-" + params['run_name'] +
                                    "-%d-step noisy-net" % args.n)
    env = gym.make(params['env_name'])
    env = wrappers.wrap_dqn(env)
    # print(env.observation_space.shape, env.action_space.n)
    net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    target_net = TargetNet(net)

    agent = DQNAgent(net, ArgmaxActionSelector(), device)

    experience_source = ExperienceSourceFirstLast(env,
                                                  agent,
                                                  params['gamma'],
                                                  steps_count=args.n)
    buffer = ExperienceReplayBuffer(experience_source,
                                    buffer_size=params['replay_size'])

    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    frame_idx = 0
    with RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            # get latest rewards
예제 #4
0
                        default=True,
                        action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()
    device = device("cuda" if args.cuda else "cpu")

    env = make(params.env_name)
    env = wrap_dqn(env)
    env.seed(123)
    net = dqn_extra.DistributionalDQN(env.observation_space.shape,
                                      env.action_space.n).to(device)
    tgt_net = TargetNet(net)

    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = DQNAgent(lambda x: net.qvals(x), selector, device=device)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=params.gamma)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=params.replay_size)
    optimizer = Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss = calc_loss(batch,
                         net,
                         tgt_net.target_model,
                         gamma=params.gamma,
                         device=device)
        loss.backward()
        optimizer.step()
        epsilon_tracker.frame(engine.state.iteration)
        if engine.state.iteration % params.target_net_sync == 0:
    last_states_q_v = net(last_states_v)
    best_last_q_v = last_states_q_v.max(dim=1)[0]
    best_last_q_v[done_masks] = 0.0
    return states_v, actions_v, best_last_q_v * gamma + rewards_v


if __name__ == "__main__":
    env = make("CartPole-v0")
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    tgt_net = TargetNet(net)
    selector = ArgmaxActionSelector()
    selector = EpsilonGreedyActionSelector(epsilon=1, selector=selector)
    agent = DQNAgent(net, selector)
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    buffer = ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    optimizer = Adam(net.parameters(), LR)

    step = episode = 0
    solved = False
    while True:
        step += 1
        buffer.populate(1)
        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print(f"{step}: episode {episode} done, {reward=:.3f}, epsilon={selector.epsilon:.2f}")
            solved = reward > 150
        if solved:
            print("YAY!")