Example #1
def test_PLACE_SHIPS_looks_reasonable_VERTICAL(n=1000):
    w = 10
    h = 10
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset(vert_probability=1)
        # With vert_probability=1 every ship is vertical, so each ship id
        # must stay within a single column.
        seen_in_previous_cols = []
        for col in range(w):
            newly_seen = []
            for row in range(h):
                element = env.state[row][col]
                if element != 0:
                    if element in seen_in_previous_cols:
                        print(
                            "\nBAD!!! Saw a %s in multiple cols. Showing the offending board:" % element)
                        show_ships(env.state)
                        raise Exception(
                            "BAD!!! Saw a %s in multiple cols." % element)
                    else:
                        newly_seen.append(element)
            seen_in_previous_cols += newly_seen

    print("\nVERTICAL")
    show_ships(env.state)
Example #2
def test_PLACE_SHIPS_looks_reasonable_TOP_RIGHT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset(favor_top=100, favor_right=100)
        cumulative_ships += env.state > 0

    print("\nTOP RIGHT")
    show_ships(cumulative_ships)
Example #3
def test_PLACE_SHIPS_looks_reasonable_NO_FAVORS(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset()
        cumulative_ships += env.state > 0

    print("\nNO FAVORS")
    show_ships(cumulative_ships)
Example #4
def test_PLACE_SHIPS_looks_reasonable_TOP_and_BOTTOM_with_STEEP_GRADIENT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset(favor_top=1, favor_bottom=1, gradient_coef=lambda x: x**10)
        cumulative_ships += env.state > 0

    print("\nTOP and BOTTOM with STEEP GRADIENT")
    show_ships(cumulative_ships)
Example #5
def test_PLACE_SHIPS_looks_reasonable_LEFT_with_UNSTEEP_GRADIENT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset(favor_left=10, gradient_coef=lambda x: x**(1/2))
        cumulative_ships += env.state > 0

    print("\nLEFT with UNSTEEP GRADIENT")
    show_ships(cumulative_ships)
Example #6
def test_PLACE_SHIPS_looks_reasonable_CENTER_with_STEEP_GRADIENT(n=1000):
    """
    Note: To get a really defined center, the gradient coeff needs to be something that grows slowly like lambda x: x**(1/10)
    """
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)

    for _ in range(n):
        env.reset(favor_top=-100000, favor_bottom=-100000,
                  favor_left=-100000, favor_right=-100000,
                  gradient_coef=lambda x: x**(1/1000))
        cumulative_ships += env.state > 0

    print("\nCENTER with STEEP GRADIENT")
    show_ships(cumulative_ships)
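
The favor/gradient examples above all hinge on how gradient_coef reshapes the placement bias, which is easier to see in isolation. Below is a minimal sketch, assuming the coefficient maps a normalized position in [0, 1] to a relative sampling weight; how BattleshipEnv.reset actually applies it internally may differ.

import numpy as np

def axis_weights(length, gradient_coef):
    # Illustrative only: relative weight per cell along one axis,
    # with the favored edge at x = 1.
    xs = np.linspace(0, 1, length)
    w = np.array([gradient_coef(x) for x in xs])
    return (w / w.sum()).round(3)

# A steep coefficient concentrates mass near the favored edge;
# a slow-growing one spreads it almost uniformly.
print(axis_weights(10, lambda x: x**10))      # steep
print(axis_weights(10, lambda x: x**(1/2)))   # unsteep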
Example #7
def evaluate_agents(episodes=100):
    env = BattleshipEnv()
    results = pd.DataFrame(columns=[
        'min', 'median', 'max', 'mean', 'std', 'avg_time', 'episodes'
    ])
    print('Agents to be evaluated: {}'.format(list(AGENT_DICT.keys())))
    for agent_name in AGENT_DICT:
        agent = AGENT_DICT[agent_name]()
        shots_fired_totals = []
        start = time.time()
        for _ in tqdm.tqdm(range(episodes),
                           desc='Evaluating {} agent'.format(agent_name)):
            agent.reset()
            obs = env.reset()
            done = False
            total_reward = 0
            shots_fired = 0
            while not done:
                action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                shots_fired += 1
            shots_fired_totals.append(shots_fired)
        # compute statistics
        avg_time = (time.time() - start) / episodes
        min_sf = min(shots_fired_totals)
        max_sf = max(shots_fired_totals)
        median_sf = statistics.median(shots_fired_totals)
        mean_sf = statistics.mean(shots_fired_totals)
        std_sf = statistics.stdev(shots_fired_totals)
        agent_results = pd.Series(data={
            'min': min_sf,
            'median': median_sf,
            'max': max_sf,
            'mean': mean_sf,
            'std': std_sf,
            'avg_time': avg_time,
            'episodes': episodes
        }, name=agent_name)
        # DataFrame.append was removed in pandas 2.0; concat the row instead.
        results = pd.concat([results, agent_results.to_frame().T])
    print(results)
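
evaluate_agents relies on a module-level AGENT_DICT mapping display names to agent constructors. The registry itself is not shown in these examples; a hypothetical version, built from the two agent classes that appear elsewhere in this file, might look like this.

# Hypothetical registry (the names are assumptions; the classes appear
# in the other examples here).
AGENT_DICT = {
    'random': RandomBattleshipAgent,
    'probabilistic': ProbabilisticAgent,
}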
Example #8
def test_generate_particle_WHEN_board_is_valid(n=1000):
    for check in range(n):
        if check % 10 == 0:
            print(check)

        # Play games up to turn_limit until you get one that isn't done
        done = True
        while done:
            # Run a real game for a random number of turns, then take its
            # observation and generate a particle to check against it
            env = BattleshipEnv()
            agent = RandomBattleshipAgent(delay=0)
            obs = env.reset()

            turn = 0
            turn_limit = random.randrange(0, 100)

            done = False
            while not done and turn <= turn_limit:
                action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)
                turn += 1

        # IMPORTANT: particle will have 0s where there is no ship, and some positive number where there is a ship
        particle = generate_particle(obs, agent.unsunk_ships)

        for i in range(len(obs)):
            for j in range(len(obs[i])):
                cell = obs[i][j]
                if (cell == HIT or cell == SUNK) and particle[i][j] == 0:
                    raise Exception(
                        'The particle is missing a ship on (%s, %s)' % (i, j))

                if cell == MISS and particle[i][j] != 0:
                    raise Exception(
                        'The particle should not have a ship on (%s, %s)' %
                        (i, j))

    print("Found %s particles successfully!" % n)
Example #9
def basic_example():
    delay = .1
    env = BattleshipEnv()
    obs = env.reset()
    # agent = RandomWithPreferredActionsAgent(delay=delay)
    agent = ProbabilisticAgent(delay=delay)

    done = False
    total_reward = 0
    while not done:
        action = agent.select_action(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render()
    shots_fired = -total_reward
    print('{} shots fired'.format(shots_fired))
Example #10
def Train(board_dimension=BOARD_DIMENSION,
          hidden_size=HIDDEN_SIZE,
          params=DEFAULT_PARAMS):

    q_function = Q_function_CNN(board_dimension=board_dimension,
                                hidden_size=hidden_size)
    q_function.to(device)
    env = BattleshipEnv()

    agent = Q_learning_agent(q_function=q_function,
                             params=params,
                             board_dimension=board_dimension)

    # Linear anneal: epsilon drops from 1.0 to 0.1 over the first 1,000,000 updates.
    epsilon_func = lambda a: np.float64(max(1 - a * .90 / 1000000, .1))

    num_training_episodes = 50000
    global_step = 0
    game_steps_list = list()
    num_of_shots_history = list()
    num_updates = 1
    C = 10000
    agent.update_target_network()
    test_history = list()

    for episode_num in range(1, num_training_episodes):

        agent.reset()
        obs = env.reset()

        done = False
        total_reward = 0
        agent.params['epsilon'] = epsilon_func(num_updates)

        test = False
        game_steps = 0

        while not done:

            old_obs = obs

            action = agent.select_action(obs, env)
            obs, reward, done, info = env.step(action)
            agent.add_experience([old_obs, action, reward, obs])
            if len(agent.experience['states']) > 500:
                agent.train_batch()
                num_updates += 1
            if num_updates % C == 0:
                agent.update_target_network()

            total_reward += reward
            game_steps += 1
            global_step += 1
        if episode_num % 50 == 0:
            test = True

        num_of_shots_history.append(game_steps)
        # Keep a sliding window of the last 100 games.
        if len(game_steps_list) >= 100:
            game_steps_list.pop(0)
        game_steps_list.append(game_steps)

        average_shots = np.array(game_steps_list).mean()
        if episode_num % 10 == 0:
            print(
                'EPISODE NUM: {0}, avg # of shots (last 100 games) {1} \t\t\t epsilon: {2}'
                .format(episode_num, average_shots.round(2),
                        agent.params['epsilon'].round(4)))
        if test:
            avg_test_shots = agent.evaluate(agent, env, episode_num,
                                            average_shots,
                                            agent.params['test_episodes'])
            test_history.append(avg_test_shots)


#Train(10,512)
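
The linear epsilon anneal in Train is easy to sanity-check by evaluating the schedule at its endpoints; the lambda below is the same one used above, minus the np.float64 cast.

epsilon_func = lambda a: max(1 - a * .90 / 1000000, .1)

for a in (0, 500000, 1000000, 5000000):
    print(a, round(epsilon_func(a), 4))
# 0 -> 1.0 (fully exploratory), 500000 -> 0.55,
# 1000000 -> 0.1 (floor reached), 5000000 -> 0.1 (held)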
Example #11
def train_q_agent(board_dimension=BOARD_DIMENSION,
                  hidden_size=HIDDEN_SIZE,
                  params=DEFAULT_PARAMS,
                  ships=DEFAULT_SHIPS,
                  use_canon=False,
                  save_name='test_save.pt'):
    q_function = Q_function_CNN(board_dimension=board_dimension,
                                hidden_size=hidden_size)
    q_function.to(device)
    env = BattleshipEnv(board_dimension, board_dimension, ships=ships)

    agent = Q_learning_agent(q_function=q_function,
                             params=params,
                             board_dimension=board_dimension)
    # Linear anneal: epsilon drops from 1.0 to 0.1 over the first 200,000 updates.
    epsilon_func = lambda a: np.float64(max(1 - a * .90 / 200000, .1))

    num_training_episodes = params['num_episodes']
    global_step = 0
    game_steps_list = list()
    num_of_shots_history = list()
    num_updates = 1
    agent.update_target_network()
    test_history = list()
    performance_dict = {'reward_hist': list(), 'avg_shot_hist': list()}

    if use_canon:
        canon = Canonicalizer(board_dimension)
        log_fname = '{0}x{0}_hidden{1}_canon_log.json'.format(
            board_dimension, hidden_size)
    else:
        canon = None
        log_fname = '{0}x{0}_hidden{1}_log.json'.format(
            board_dimension, hidden_size)

    for episode_num in range(1, num_training_episodes):
        agent.reset()
        obs = env.reset()

        done = False
        total_reward = 0
        agent.params['epsilon'] = epsilon_func(num_updates)

        test = False
        game_steps = 0

        while not done:
            if use_canon:
                # Canonicalize the current observation, act in canonical
                # space, then map the action back onto the real board.
                old_c_obs, c_tuple = canon.canon_obs(obs, return_tuple=True)
                c_action = agent.select_action(old_c_obs)
                action = canon.uncanon_action(c_action, c_tuple)
            else:
                old_obs = obs
                action = agent.select_action(obs)
            obs, reward, done, info = env.step(action)
            if done:
                game_steps += 1
                global_step += 1
                total_reward += reward
                break
            if use_canon:
                # Pair the action with the observation it was actually taken
                # from, and re-canonicalize the successor observation.
                c_obs = canon.canon_obs(obs)
                agent.add_experience([old_c_obs, c_action, reward, c_obs])
            else:
                agent.add_experience([old_obs, action, reward, obs])

            if len(agent.experience['states']) > 500:
                agent.train_batch_base()
                num_updates += 1
            if num_updates % agent.params['C'] == 0:
                agent.update_target_network()

            total_reward += reward
            game_steps += 1
            global_step += 1

        if episode_num % 50 == 0:
            test = True

        num_of_shots_history.append(game_steps)
        # Keep a sliding window of the last 100 games.
        if len(game_steps_list) >= 100:
            game_steps_list.pop(0)
        game_steps_list.append(game_steps)

        average_shots = np.array(game_steps_list).mean()
        if episode_num % 10 == 0:
            print(
                'EPISODE NUM: {0}, avg # of shots (last 100 games) {1} \t\t\t epsilon: {2}'
                .format(episode_num, average_shots.round(2),
                        agent.params['epsilon'].round(4)))
        if test:
            avg_test_shots, avg_reward = agent.evaluate(
                env, agent.params['test_episodes'], canon=canon)
            performance_dict['reward_hist'].append(avg_reward)
            performance_dict['avg_shot_hist'].append(avg_test_shots)
            test_history.append(avg_test_shots)
            # Checkpoint the running performance log after every test round.
            with open(log_fname, "w") as f:
                json.dump(performance_dict, f)

    # Final write of the performance log.
    with open(log_fname, "w") as f:
        json.dump(performance_dict, f)

    # Note: save_name is accepted but unused; the formatted name below is used.
    suffix = '_canon' if use_canon else ''
    agent.save_agent('{0}x{0}_hidden{1}_episodes{2}{3}_model.pt'.format(
        board_dimension, hidden_size, num_training_episodes, suffix))
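
train_q_agent only touches the Canonicalizer through canon_obs and uncanon_action. A minimal sketch of what such a class could look like for a square board, using the 8 dihedral symmetries; the transform choice, the tie-breaking rule, and the flat action encoding are all assumptions, and the repo's actual implementation may differ.

import numpy as np

class CanonicalizerSketch:
    # Illustrative only: pick the lexicographically smallest of the 8
    # rotated/flipped variants of a square board as its canonical form.

    def __init__(self, board_dimension):
        self.n = board_dimension

    def _variants(self, obs):
        # Enumerate all 8 dihedral transforms as (board, (k, flipped)).
        for k in range(4):
            rot = np.rot90(obs, k)
            yield rot, (k, False)
            yield np.fliplr(rot), (k, True)

    def canon_obs(self, obs, return_tuple=False):
        obs = np.asarray(obs)
        best, best_tuple = min(self._variants(obs),
                               key=lambda vt: vt[0].tobytes())
        return (best, best_tuple) if return_tuple else best

    def uncanon_action(self, c_action, c_tuple):
        # Undo the canonical transform on an indicator board to map a
        # flat action index back onto the original board.
        k, flipped = c_tuple
        indicator = np.zeros((self.n, self.n), dtype=bool)
        indicator[np.unravel_index(c_action, indicator.shape)] = True
        if flipped:
            indicator = np.fliplr(indicator)
        indicator = np.rot90(indicator, -k)
        return int(np.flatnonzero(indicator)[0])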
Example #12
# =============================================================================
# class Q_data(Dataset):
#     def __init__(self,experience):
#         self.experience=experience
#         
#     def __len__(self):
#         return len(self.experience)
#     def __getitem__(self,index):
#         
#         return None
# =============================================================================

q_function = Q_function_FC(BOARD_DIMENSION**2, BOARD_DIMENSION**2)

env = BattleshipEnv()

params = {'alpha': .001, 'epsilon': 1, 'epsilon_decay': .99,
          'epsilon_length': 1000, 'replay_size': 50000, 'gamma': .99,
          'criterion': nn.MSELoss(), 'optim': torch.optim.Adam,
          'num_of_Q_epochs': 2}

agent = Q_learning_agent(q_function=q_function, params=params)

# Exponential decay from 1.1 toward an asymptotic floor of 0.1.
epsilon_func = lambda a: np.exp(-a / 30000) + .1

num_training_episodes = 50000
shots_fired_totals = list()
global_step = 0
game_steps_list = list()
num_of_shots_history = list()