load_network(session, variables, NETWORK_FILE_PATH)

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)


def make_training_move(board_state, side):
    mini_batch_board_states.append(np.ravel(board_state) * side)
    move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
    mini_batch_moves.append(move)
    return game_spec.flat_move_to_tuple(move.argmax())


for episode_number in range(1, NUMBER_OF_GAMES_TO_RUN):
    # randomize if going first or second
    if bool(random.getrandbits(1)):
        reward = game_spec.play_game(make_training_move, game_spec.get_random_player_func())
    else:
        reward = -game_spec.play_game(game_spec.get_random_player_func(), make_training_move)

    results.append(reward)

    last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

    # scale by game length so winning quickly is better than winning slowly,
    # and losing slowly is better than losing quickly
    reward /= float(last_game_length)

    mini_batch_rewards += ([reward] * last_game_length)

    if episode_number % BATCH_SIZE == 0:
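The per-move reward scaling in the loop above is easiest to see with concrete numbers. The small standalone sketch below uses a hypothetical per_move_rewards helper, written only for this illustration and not part of the listing, to show that a quick win credits each position more than a slow win, and a slow loss penalises each position less than a quick one.

def per_move_rewards(game_result, game_length):
    # game_result: +1 for a win, -1 for a loss, 0 for a draw
    # every position in the game is credited with game_result / game_length
    return [game_result / float(game_length)] * game_length

print(per_move_rewards(+1, 5))  # quick win: five rewards of 0.2
print(per_move_rewards(+1, 9))  # slow win: nine rewards of ~0.11
print(per_move_rewards(-1, 9))  # slow loss: ~-0.11 per move, better than -0.2 for a quick loss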
                                          board_state, side)
    return game_spec.flat_move_to_tuple(np.argmax(move))


board_states_training = {}
board_states_test = []
episode_number = 0

while len(board_states_training) < TRAIN_SAMPLES + TEST_SAMPLES:
    board_state = generate_random_board_position()
    board_state_flat = tuple(np.ravel(board_state))

    # only accept the board_state if not already in the dict
    if board_state_flat not in board_states_training:
        result = game_spec.play_game(make_move, make_move, board_state=board_state)
        board_states_training[board_state_flat] = float(result)

# take a random selection from training into a test set
for _ in range(TEST_SAMPLES):
    sample = random.choice(list(board_states_training.keys()))
    board_states_test.append((sample, board_states_training[sample]))
    del board_states_training[sample]

board_states_training = list(board_states_training.items())

# measure the error of the value network on the held-out test set
test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test],
                                           target_placeholder: [[x[1]] for x in board_states_test]})

while True:
    np.random.shuffle(board_states_training)
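The listing relies on a generate_random_board_position helper that is not shown. The sketch below is one hypothetical way such a helper could work for a 3x3 board: play a random number of alternating random moves from an empty board so that positions from all phases of the game appear. The project's actual helper may differ, for example by also rejecting positions where the game is already over.

import random
import numpy as np

def generate_random_board_position(min_moves=1, max_moves=6):
    # Hypothetical sketch: build a random position by playing alternating random
    # moves; it does not check whether one side has already won.
    board = np.zeros((3, 3), dtype=np.int32)
    side = 1
    for _ in range(random.randint(min_moves, max_moves)):
        empty = list(zip(*np.where(board == 0)))
        if not empty:
            break
        row, col = random.choice(empty)
        board[row, col] = side
        side = -side
    return board

print(generate_random_board_position())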
print("could not find previous weights so initialising randomly") for i in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP): if os.path.isfile(BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p'): load_network(session, historical_networks[i][2], BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p') elif os.path.isfile(STARTING_NETWORK_WEIGHTS): # if we can't load a historical file use the current network weights load_network(session, historical_networks[i][2], STARTING_NETWORK_WEIGHTS) for episode_number in range(1, NUMBER_OF_GAMES_TO_PLAY): opponent_index = random.randint(0, NUMBER_OF_HISTORICAL_COPIES_TO_KEEP-1) make_move_historical_for_index = functools.partial(make_move_historical, opponent_index) # randomize if going first or second if bool(random.getrandbits(1)): reward = game_spec.play_game(make_training_move, make_move_historical_for_index) else: reward = -game_spec.play_game(make_move_historical_for_index, make_training_move) results.append(reward) last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards) # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick reward /= float(last_game_length) mini_batch_rewards += ([reward] * last_game_length) episode_number += 1 if episode_number % MINI_BATCH_SIZE == 0: