Example #1
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda" if cuda else "cpu")

    writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma,
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params.stop_reward) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

    exp_queue.put(None)
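
Every example in this listing is the producer half of a two-process setup: play_func generates transitions, pushes them into exp_queue and finally puts None to signal completion. The consumer half is not part of these snippets; the following is a minimal sketch of what it could look like, reusing play_func, params, net and args from the example above. The queue size, the _add()-based buffer filling and the placeholder training step are assumptions, not code from the original project.

import ptan
import torch.multiprocessing as mp

if __name__ == "__main__":
    mp.set_start_method("spawn")
    exp_queue = mp.Queue(maxsize=4)            # small queue keeps actor and learner in step
    play_proc = mp.Process(target=play_func,
                           args=(params, net, args.cuda, exp_queue))
    play_proc.start()

    buffer = ptan.experience.ExperienceReplayBuffer(
        experience_source=None, buffer_size=params.replay_size)
    while play_proc.is_alive():
        # Move everything the actor produced so far into the replay buffer.
        while exp_queue.qsize() > 0:
            exp = exp_queue.get()
            if exp is None:                    # actor reached stop_reward and finished
                play_proc.join()
                break
            buffer._add(exp)                   # fill the buffer manually (no experience source)
        if len(buffer) < params.replay_initial:
            continue
        # ... sample a batch from `buffer` and run one DQN optimization step here ...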
Example #2
def play_func(params, net, cuda, exp_queue):
    env = make_env(params)

    writer = SummaryWriter(comment="-" + params['run_name'] +
                           "-05_new_wrappers")
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

    exp_queue.put(None)
Example #3
def play_func(params, net, cuda, fsa, exp_queue, fsa_nvec=None):
    device = torch.device("cuda" if cuda else "cpu")
    env = make_env(params)

    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")
    if not fsa:
        selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
        epsilon_tracker = common.EpsilonTracker(selector, params)
        agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
    else:
        if 'Index' in net.__class__.__name__:
            selector = ptan.actions.EpsilonGreedyActionSelectorFsa(fsa_nvec, epsilon=params['epsilon_start'])
            epsilon_tracker = common.IndexedEpsilonTracker(selector, params, fsa_nvec)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa, epsilon_tracker=epsilon_tracker)
        else:
            selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
            epsilon_tracker = common.EpsilonTracker(selector, params)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
            # epsilon_tracker = common.IndexedEpsilonTrackerNoStates(selector, params, fsa_nvec)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0

    with common.RewardTracker(writer, params['stop_reward'], params['telemetry'], params['plot']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            if not fsa or 'Index' not in net.__class__.__name__:
                epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            new_scores = exp_source.pop_total_scores()
            if new_rewards:
                if not fsa or 'Index' not in net.__class__.__name__:
                    new_score = [] if not new_scores else new_scores[0]
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx, selector.epsilon, params['plot']):
                        break
                else:
                    new_score = [] if not new_scores else new_scores[0]
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx, selector.epsilon_dict, params['plot']):
                        break

    exp_queue.put(None)
Example #4
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    for frame_idx, exp in enumerate(exp_source):
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
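
EpisodeEnded and BATCH_MUL come from the surrounding module and are not shown in this snippet. A plausible minimal definition, consistent with how they are used above (the concrete BATCH_MUL value is an assumption):

import collections

# End-of-episode marker pushed into exp_queue next to raw experiences, so the
# training process can log reward, episode length and current epsilon.
EpisodeEnded = collections.namedtuple("EpisodeEnded", ("reward", "steps", "epsilon"))

# The learner consumes batches BATCH_MUL times larger than nominal, so the actor
# advances the epsilon schedule by frame_idx / BATCH_MUL to keep the decay aligned.
BATCH_MUL = 4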
Example #5
def play_func(params, net, cuda, exp_queue, device_id):
    """
    The paper suggests sampling the actions from the learner net, so that requires little change from the multienv implementation.

    *** There is a reason that it reinitializes the envs in this function that has to do with parallelization ***
    """
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)

    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)

    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast_AM([envSI, envDA],
                                              agent,
                                              gamma=params['gamma'],
                                              steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('mimic_models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % 500 == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'mimic_models/{}_{}.pth'.format(
                        run_name, game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
               np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('mimic_models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()

    exp_queue.put(None)
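
The checkpoints above are written with torch.save(net, ...), i.e. the whole model object rather than a state_dict, so they can be reloaded directly as long as the network class is importable. A short usage sketch with a hypothetical file name:

import torch

# Load a full-model checkpoint produced by the loop above (recent PyTorch versions
# may additionally require weights_only=False for full-object checkpoints).
model = torch.load("mimic_models/some_run_500.pth", map_location="cpu")
model.eval()   # inference mode for evaluation rollouts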
                        action="store_true",
                        help="Enable double dqn")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine, batch):
        optimizer.zero_grad()
        loss_v = calc_loss_double_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
Example #7
def play_func(params, net, cuda, exp_queue, device_id):
    env_name = params['env_name']
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']
    env = gym.make(env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']

    writer = SummaryWriter(comment="-" + params['run_name'] + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    best_reward = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
                    if mean_reward > best_reward:
                        print("Saving model...")
                        model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
                        torch.save(net, model_name)
                        new_row = [model_name, num_games, mean_reward, epsilon_str]
                        out_csv.writerow(new_row)
                        best_reward = mean_reward
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()

    exp_queue.put(None)
Example #8
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env)

    writer = SummaryWriter(comment="-" + params['run_name'] + "-basic")
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(
        net)  # Target network (copy of net synchronized from time to time)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])  # epsilon-greedy selector of actions
    epsilon_tracker = common.EpsilonTracker(
        selector,
        params)  # schedules epsilon according to the current frame number
    agent = ptan.agent.DQNAgent(
        net, selector,
        device=device)  # agent class with Q-network and selector

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1
    )  # generates tuples from the environment in the form (s, a, r, s')
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size']
    )  # buffer of experiences for experience replay
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
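
common.EpsilonTracker is used in every example here but never defined in the snippets. A minimal sketch of the linear decay it typically implements; the epsilon_final and epsilon_frames parameter names are assumptions alongside the epsilon_start key used above:

class EpsilonTrackerSketch:
    """Linearly anneal selector.epsilon from epsilon_start to epsilon_final
    over epsilon_frames frames, then keep it at epsilon_final."""

    def __init__(self, selector, params):
        self.selector = selector
        self.eps_start = params['epsilon_start']
        self.eps_final = params['epsilon_final']
        self.eps_frames = params['epsilon_frames']
        self.frame(0)

    def frame(self, frame_idx):
        eps = self.eps_start - frame_idx / self.eps_frames * (self.eps_start - self.eps_final)
        self.selector.epsilon = max(self.eps_final, eps)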
Example #9
    net_deer = model.DQNModel(
        deer_obs.spaces[0].shape, deer_obs.spaces[1].shape,
        m_env.get_action_space(deer_handle)[0]).to(device)
    tgt_net_deer = ptan.agent.TargetNet(net_deer)
    print(net_deer)

    net_tiger = model.DQNModel(
        tiger_obs.spaces[0].shape, tiger_obs.spaces[1].shape,
        m_env.get_action_space(tiger_handle)[0]).to(device)
    tgt_net_tiger = ptan.agent.TargetNet(net_tiger)
    print(net_tiger)

    action_selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=PARAMS.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(action_selector, PARAMS)
    preproc = model.MAgentPreprocessor(device)

    agent = model.GroupDQNAgent([net_deer, net_tiger],
                                action_selector,
                                device,
                                preprocessor=preproc)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           PARAMS.gamma,
                                                           vectorized=True)
    deer_buffer = ptan.experience.ExperienceReplayBuffer(
        None, PARAMS.replay_size)
    tiger_buffer = ptan.experience.ExperienceReplayBuffer(
        None, PARAMS.replay_size)
    deer_optimizer = optim.Adam(net_deer.parameters(), lr=PARAMS.learning_rate)
Example #10
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                    steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon, last_dq_losses):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                                params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
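
calc_loss() in this example is defined elsewhere; it receives the sampled batch, the importance-sampling weights, both networks and gamma, and returns the loss together with new per-sample priorities. A minimal sketch of such a prioritized-replay DQN loss, assuming ptan's ExperienceFirstLast fields (state, action, reward, last_state):

import numpy as np
import torch

def calc_loss_sketch(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    # Unpack ptan ExperienceFirstLast tuples; terminal transitions have last_state None.
    states = np.stack([np.asarray(e.state) for e in batch])
    actions = np.array([e.action for e in batch])
    rewards = np.array([e.reward for e in batch], dtype=np.float32)
    dones = np.array([e.last_state is None for e in batch], dtype=bool)
    next_states = np.stack([np.asarray(e.state if e.last_state is None else e.last_state)
                            for e in batch])

    states_v = torch.as_tensor(states).float().to(device)            # pixel scaling, if any,
    next_states_v = torch.as_tensor(next_states).float().to(device)  # is assumed to happen elsewhere
    actions_v = torch.as_tensor(actions).long().to(device)
    rewards_v = torch.as_tensor(rewards).to(device)
    done_mask = torch.as_tensor(dones).to(device)
    weights_v = torch.as_tensor(batch_weights, dtype=torch.float32).to(device)

    # Q(s, a) for the taken actions and the bootstrapped one-step target.
    q_vals = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q = tgt_net(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0
        target = rewards_v + gamma * next_q

    # Importance-sampling weights scale each squared TD error; the individual
    # losses (plus a small constant) become the new priorities for the buffer.
    losses = weights_v * (q_vals - target) ** 2
    return losses.mean(), (losses + 1e-5).detach().cpu().numpy()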
Example #11
def play_func(params, net, cuda, exp_queue, device_id):
    """
    With multiple envs, the exp_source class will return experiences
    (defined as a tuple of (state_framestack, action, reward, last_state_framestack) alternating between
    the two environments. Otherwise it returns just experinces from a single env. Even if the games have different
    frame shapes, they will by reduced to 84x84

    *** There is a reason that it reinitializes the envs in this function that has to do with parallelization ***
    """
    run_name = 'demon_invaders'
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)

    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)

    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")

    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']

    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        [envSI, envDA], agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models_multi/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)

            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'models_multi/{}_{}_{}.pth'.format(
                        run_name, params['secondary'], game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt(
                        'models_multi/{}_{}_reward.txt'.format(
                            run_name, params['secondary']),
                        np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models_multi/{}_{}_{}.pth'.format(run_name,
                                                    params['secondary'],
                                                    game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt(
        'models_multi/{}_{}_reward.txt'.format(run_name, params['secondary']),
        np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models_multi/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()

    exp_queue.put(None)
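
A quick way to observe the interleaving described in the docstring is to iterate the multi-env experience source directly. A usage sketch reusing envSI, envDA and agent from the example above; the gamma value is arbitrary:

import numpy as np
import ptan

# With a list of envs, consecutive experiences generally alternate between them.
probe_source = ptan.experience.ExperienceSourceFirstLast(
    [envSI, envDA], agent, gamma=0.99, steps_count=1)
for i, exp in enumerate(probe_source):
    print(i, np.asarray(exp.state).shape, exp.reward, exp.last_state is None)
    if i >= 5:
        break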
Example #12
def main():
    global params_save_file

    game = 'revenge'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()

    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-dqfd(PDD DQN)")
    net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    demo_data = demo_data_reader.get_demo_data(env, game, num_states=params['demo_size'], skip=params['skip-frames'])
    exp_source = ptan.experience.ExperienceSourceNFirstLast(env, agent, gamma=params['gamma'],
                    steps_count=params['n-steps'], demo_data=demo_data)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    buffer.populate_demo_data()
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'], weight_decay=L2_REG_LAMBDA)

    print("Demo data size: {}".format(buffer.demo_samples))
    sys.stdout.flush()

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            if frame_idx > params['pretrain_steps']:
                buffer.populate(params['steps'])
            else:
                if frame_idx % 500 == 0:
                    writer.add_scalar("beta", beta, frame_idx)
                    reward_tracker.record_training(frame_idx, selector.epsilon, last_dq_losses, last_n_losses,
                        last_e_losses, last_demo_sizes)

            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon, last_dq_losses,
                    last_n_losses, last_e_losses, last_demo_sizes):
                    break

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'] * params['steps'], beta)
            batch_demo_mask = (np.array(batch_indices) < buffer.demo_samples).astype(np.uint8)

            loss_v, sample_prios = calc_loss(batch, batch_demo_mask, batch_weights, net, tgt_net.target_model,
                                                params["gamma"], params["gamma"] ** params['n-steps'],
                                                cuda=args.cuda)
            loss_v.backward()
            optimizer.step()

            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
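
The DQfD-style loss here (calc_loss with a demo mask and an n-step discount) combines one-step and n-step TD terms with a supervised large-margin term on demonstration samples, which is what last_e_losses tracks. calc_loss itself is defined elsewhere; below is a minimal sketch of just the margin term, with the margin value and tensor names as assumptions:

import torch

def margin_loss_sketch(q_vals, demo_actions, demo_mask_v, margin=0.8):
    # DQfD large-margin term: J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E),
    # where l(a_E, a) = margin for a != a_E and 0 otherwise. Applied only to
    # demonstration samples, selected by demo_mask_v (float mask of shape [batch]).
    margins = torch.full_like(q_vals, margin)
    margins.scatter_(1, demo_actions.unsqueeze(-1), 0.0)   # zero margin for the expert action
    q_expert = q_vals.gather(1, demo_actions.unsqueeze(-1)).squeeze(-1)
    per_sample = (q_vals + margins).max(1)[0] - q_expert
    return (per_sample * demo_mask_v).mean()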
Example #13
def main():
    global params_save_file

    game = 'spaceinvaders'
    params_save_file += '-' + game

    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("--double", default=True, action="store_true", help="Enable double DQN")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double))
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], cuda=args.cuda,
                               double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net, cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)

            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
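
With --double enabled, the training loss uses the double-DQN target: the online network picks the next action and the target network evaluates it, instead of taking the target network's own maximum. A minimal sketch of that target computation; the tensor names are assumptions:

import torch

@torch.no_grad()
def double_dqn_target(rewards_v, done_mask, next_states_v, net, tgt_net, gamma):
    # Online net chooses the argmax action; target net provides its value.
    next_actions = net(next_states_v).max(1)[1]
    next_q = tgt_net(next_states_v).gather(1, next_actions.unsqueeze(-1)).squeeze(-1)
    next_q[done_mask] = 0.0          # no bootstrap through terminal states
    return rewards_v + gamma * next_q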