def train_alphazero(lr, dropout, num_channels, epochs, batch_size,
                    replay_buffer_size, temp_decrese_moves, mcts_rollouts,
                    n_episodes_per_iteration, eval, model, test):
    # NOTE(review): `eval` shadows the builtin; kept to avoid breaking keyword
    # callers. `temp_decrese_moves`, `mcts_rollouts` and `eval` are logged (or
    # ignored) but never used in this body — presumably consumed elsewhere; confirm.
    """Run the AlphaZero-style training loop: self-play, train, evaluate, repeat.

    Args:
        lr: Learning rate passed to ``NeuralNet``.
        dropout: Dropout rate passed to ``NeuralNet``.
        num_channels: Conv channel count passed to ``NeuralNet``.
        epochs: Training epochs per network update.
        batch_size: Minibatch size for training and replay sampling.
        replay_buffer_size: Capacity of the experience ``ReplayBuffer``.
        temp_decrese_moves: Logged hyperparameter (unused here — see NOTE).
        mcts_rollouts: Logged hyperparameter (unused here — see NOTE).
        n_episodes_per_iteration: Self-play games generated per outer iteration.
        eval: Unused here (see NOTE).
        model: Optional checkpoint path/name to load into the network.
        test: If truthy, enter an interactive play/debug loop instead of training.

    Does not return: both the test loop and the training loop run forever.
    """
    board = Game(player_turn=1)
    network = NeuralNet(board, num_channels, lr, dropout, epochs, batch_size)

    # Optionally resume from a saved checkpoint.
    if model is not None:
        print("Loading {}".format(model))
        network.load(model)

    # Interactive test mode: let the network play the -1 player's moves,
    # then drop into pdb after each sequence so a human can inspect/respond.
    if test:
        while True:
            while board.turn_player() == -1:
                # network(...) presumably returns (policy, ...) — argmax picks
                # the highest-probability valid move. TODO confirm output shape.
                move = np.argmax(network(board, board.valid_moves())[0][0])
                print("Board {}, move {}".format(board.board(), move))
                board.move(move)
            print("{}".format(board.board()))
            import pdb
            pdb.set_trace()

    # set up the experiment
    experiment = Experiment(
        api_key=os.environ.get('ALPHAZERO_API_KEY'),
        project_name=os.environ.get('ALPHAZERO_PROJECT_NAME'),
        workspace=os.environ.get('ALPHAZERO_WORKSPACE'))
    experiment.log_multiple_params({
        'lr': lr,
        'dropout': dropout,
        'num_channels': num_channels,
        'epochs': epochs,
        'batch_size': batch_size,
        'replay_buffer_size': replay_buffer_size,
        'temp_decrese_moves': temp_decrese_moves,
        'mcts_rollouts': mcts_rollouts,
        'n_episodes_per_iteration': n_episodes_per_iteration
    })

    buf = ReplayBuffer(replay_buffer_size, batch_size)
    epoch = 0
    while True:
        epoch += 1
        # BUGFIX: time.clock() was removed in Python 3.8; use perf_counter().
        print("Epoch {}, {}".format(epoch, time.perf_counter()))

        # Self-play phase: generate training data into the replay buffer.
        for i in range(n_episodes_per_iteration):
            winner = execute_episode(network, buf, experiment)
            print("Finished episode {}, winner {}, time {}".format(
                i, winner, time.perf_counter()))

        # Snapshot the current weights so we can revert if evaluation regresses.
        # NOTE(review): clone() return value is discarded — presumably the
        # snapshot is kept internally (see revert_network below); confirm.
        network.clone()
        loss, entropy = train_network(network, buf, experiment)
        print("Training loss: {}, entropy: {}".format(loss, entropy))

        # Gatekeeping: keep the new weights only if they win >= half of the
        # 10 evaluation games, otherwise roll back to the snapshot.
        won_counter = evaluate_network(network, board, 10)
        if won_counter >= 5:
            print("Performance improved, {} games won".format(won_counter))
            network.save()
        else:
            print("Performance decreased, {} games won".format(won_counter))
            network.revert_network()