Example #1
import os
import pdb
import time

import numpy as np
from comet_ml import Experiment

# The remaining names (Game, NeuralNet, ReplayBuffer, execute_episode,
# train_network, evaluate_network) come from the surrounding project;
# the module paths below are placeholders, not the project's real layout.
from game import Game
from neural_net import NeuralNet
from replay_buffer import ReplayBuffer
from self_play import execute_episode, train_network, evaluate_network

def train_alphazero(lr, dropout, num_channels, epochs, batch_size,
                    replay_buffer_size, temp_decrease_moves, mcts_rollouts,
                    n_episodes_per_iteration, eval, model, test):
    """AlphaZero training loop: self-play, network training, and evaluation
    against the previous network, keeping whichever version plays better."""
    board = Game(player_turn=1)
    network = NeuralNet(board, num_channels, lr, dropout, epochs, batch_size)
    if model is not None:
        print("Loading {}".format(model))
        network.load(model)
    if test:
        # Test mode: play out games with the loaded network choosing moves
        # greedily, then drop into the debugger to inspect each position.
        while True:
            while board.turn_player() == -1:
                move = np.argmax(network(board, board.valid_moves())[0][0])
                print("Board {}, move {}".format(board.board(), move))
                board.move(move)
            print("{}".format(board.board()))
            pdb.set_trace()
    # set up the experiment
    experiment = Experiment(
        api_key=os.environ.get('ALPHAZERO_API_KEY'),
        project_name=os.environ.get('ALPHAZERO_PROJECT_NAME'),
        workspace=os.environ.get('ALPHAZERO_WORKSPACE'))
    experiment.log_parameters({
        'lr': lr,
        'dropout': dropout,
        'num_channels': num_channels,
        'epochs': epochs,
        'batch_size': batch_size,
        'replay_buffer_size': replay_buffer_size,
        'temp_decrease_moves': temp_decrease_moves,
        'mcts_rollouts': mcts_rollouts,
        'n_episodes_per_iteration': n_episodes_per_iteration
    })
    # Replay buffer of recent self-play positions used as training data.
    buf = ReplayBuffer(replay_buffer_size, batch_size)

    epoch = 0
    while True:
        epoch += 1
        # time.clock() was removed in Python 3.8; use perf_counter() instead.
        print("Epoch {}, {}".format(epoch, time.perf_counter()))
        # Self-play phase: generate games and store them in the replay buffer.
        for i in range(n_episodes_per_iteration):
            winner = execute_episode(network, buf, experiment)
            print("Finished episode {}, winner {}, time {}".format(
                i, winner, time.perf_counter()))
        # Snapshot the current weights so training can be rolled back.
        network.clone()
        loss, entropy = train_network(network, buf, experiment)

        print("Training loss: {}, entropy: {}".format(loss, entropy))
        # Play 10 evaluation games; keep the new weights only if the updated
        # network wins at least half of them, otherwise revert to the snapshot.
        won_counter = evaluate_network(network, board, 10)
        if won_counter >= 5:
            print("Performance improved, {} games won".format(won_counter))
            network.save()
        else:
            print("Performance decreased, {} games won".format(won_counter))
            network.revert_network()
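
# A minimal usage sketch, not from the original project: the hyperparameter
# values below are illustrative guesses, and any command-line wiring in the
# real repository may differ.
if __name__ == '__main__':
    train_alphazero(
        lr=0.001,
        dropout=0.3,
        num_channels=512,
        epochs=10,
        batch_size=64,
        replay_buffer_size=200000,
        temp_decrease_moves=10,
        mcts_rollouts=25,
        n_episodes_per_iteration=100,
        eval=False,
        model=None,   # or a path to a saved checkpoint to resume from
        test=False)   # set True to play/inspect games instead of training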