Example #1
def run_process(args, share_model, board_max, n_rows, rank):
    import random
    import time
    import numpy as np
    from collections import deque
    # `agent` (an Agent_MCTS instance) and `get_equi_data` are assumed to be
    # defined at module level; they are not created in this snippet.
    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, n_rows)
    board_render = BoardRender(board_max, render_off=True, inline_draw=True)
    board_render.clear()

    data_buffer = deque(maxlen=100000)
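    # replay buffer of (state, mcts_probs, z) samples; it is filled with
    # symmetry-augmented self-play games below and sampled by agent.learn()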

    Ts = []
    Trewards = []
    TQmax = []
    for episode in range(1000):
        random.seed(time.time())
        board.reset()
        board_render.clear()
        """ start a self-play game using a MCTS player, reuse the search tree
        store the self-play data: (state, mcts_probs, z)
        """
        p1, p2 = board.players
        states, mcts_probs, current_players = [], [], []
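        # per-game records: board features, MCTS visit-count distributions,
        # and which player was to move at each position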
        for step in range(10000):
            if len(data_buffer) > 32:
                loss, entropy = agent.learn(data_buffer)
#                    print('loss : ',loss,' entropy : ',entropy)

            move, move_probs = agent.get_action(board, temp=1.0, return_prob=1)
            # store the data
            states.append(board.current_state())
            mcts_probs.append(move_probs)
            current_players.append(board.current_player)
            # perform a move
            board.step(move)
            board_render.draw(board.states)
            end, winner = board.game_end()
            if end:
                # winner from the perspective of the current player of each state
                winners_z = np.zeros(len(current_players))
                if winner != -1:
                    winners_z[np.array(current_players) == winner] = 1.0
                    winners_z[np.array(current_players) != winner] = -1.0
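                # on a tie (winner == -1) every entry of winners_z stays 0.0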
                #reset MCTS root node
                agent.reset_player()
                if winner != -1:
                    print("Game end. Winner is player:", winner)
                else:
                    print("Game end. Tie")


#                return winner, zip(states, mcts_probs, winners_z)
                play_data = zip(states, mcts_probs, winners_z)
                ex_play_data = get_equi_data(play_data, board_max, board_max)
                data_buffer.extend(ex_play_data)

                break

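A minimal sketch, not taken from this codebase, of the get_equi_data helper the examples rely on: it augments each finished self-play game with the board's rotations and mirror flips, AlphaZero-style. The (state, mcts_probs, z) tuple layout and the row-major flat probability vector are assumptions.

def get_equi_data(play_data, board_width, board_height):
    """Return each (state, mcts_probs, z) sample plus its rotations and flips."""
    import numpy as np
    extended = []
    for state, mcts_probs, winner_z in play_data:
        # the flat MCTS probability vector is assumed to index the board row-major
        prob_grid = np.asarray(mcts_probs).reshape(board_height, board_width)
        for k in range(4):  # 0/90/180/270 degree rotations
            rot_state = np.rot90(np.asarray(state), k, axes=(-2, -1))
            rot_probs = np.rot90(prob_grid, k)
            extended.append((rot_state, rot_probs.flatten(), winner_z))
            # plus a horizontal flip of each rotation
            extended.append((np.flip(rot_state, axis=-1),
                             np.fliplr(rot_probs).flatten(), winner_z))
    return extended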
Example #2
def run_process(args, share_model, board_max, n_rows, rank):
    import random
    import time
    # `agent` (an Agent_MCTS instance) is assumed to be defined at module level.
    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, n_rows)
    board_render = BoardRender(board_max, render_off=False, inline_draw=True)
    board_render.clear()
    board_render.draw(board.states)

    for episode in range(1):
        random.seed(time.time())
        board.reset()
        board_render.clear()
        """ start a self-play game using a MCTS player, reuse the search tree
        store the self-play data: (state, mcts_probs, z)
        """
        p1, p2 = board.players
        player = input('select player (1: black, 2: white): ')
        if player == '1':
            play_step = 0
        else:
            play_step = 1
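        # play_step is the parity of the steps on which the human moves
        # (black moves first, so choosing player 1 means moving on even steps)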
        for step in range(10000):
            if step % 2 == play_step:
                ss = input('input x,y (or q to quit): ')
                if ss == 'q':  # check the raw string before splitting
                    return
                pos = ss.split(',')
                move = int(pos[0]) + int(pos[1]) * board_max
                print('move ', move)
            else:
                move, move_probs = agent.get_action(board,
                                                    temp=1.0,
                                                    return_prob=1)
            board.step(move)
            board_render.draw(board.states)
            end, winner = board.game_end()
            if end:
                # winner from the perspective of the current player of each state
                agent.reset_player()
                if winner != -1:
                    print("Game end. Winner is player:", winner)
                else:
                    print("Game end. Tie")


#                return winner, zip(states, mcts_probs, winners_z)
                break

Example #3
def human_process(args, share_model, rank, self_play, shared_lr_mul,
                  shared_g_cnt, shared_q, lock):
    print('human play')
    self_play = False
    board_max = args.board_max
    from agent import Agent_MCTS
    agent = Agent_MCTS(args, 5, 800, self_play, shared_lr_mul, shared_g_cnt)
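    # the positional 5 and 800 are presumably the MCTS exploration constant and
    # the number of playouts per move; Agent_MCTS's signature is not shown here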
    with lock:
        agent.model_update(share_model)

    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, args.n_rows)
    board_render = BoardRender(board_max, render_off=False, inline_draw=True)
    board.reset()
    board_render.clear()
    board_render.draw(board.states)

    p1, p2 = board.players
    player = input('select player (1: black, 2: white): ')
    if player == '1':
        play_step = 0  # black (player 1) moves first, on even steps
    else:
        play_step = 1
    for step in range(10000):
        if step % 2 == play_step:
            ss = input('input x,y (or q to quit): ')
            if ss == 'q':  # check the raw string before splitting
                return
            pos = ss.split(',')
            move = int(pos[0]) + int(pos[1]) * board_max
            print('move ', move)
        else:
            move, move_probs = agent.get_action(board)
        board.step(move)
        board_render.draw(board.states)
        end, winner = board.game_end()
        if end:
            # winner from the perspective of the current player of each state
            agent.reset_player()
            if winner != -1:
                print("Game end. Winner is player:", winner)
            else:
                print("Game end. Tie")


#                return winner, zip(states, mcts_probs, winners_z)
            return
Example #4
def run_process(args, share_model, board_max, rank):
    import random
    import time
    import torch.optim as optim
    # `_plot_line` is assumed to be a plotting helper defined at module level.
    from checkerboard import Checkerboard
    env = Checkerboard(board_max, args.render)

    from agent import Agent_rainbow
    B_Agent = Agent_rainbow(args)
    W_Agent = Agent_rainbow(args)
    # B_share_model / W_share_model are not defined in the original snippet;
    # `share_model` is assumed here to hold the shared (black, white) networks
    B_share_model, W_share_model = share_model
    B_Agent.main_dqn = B_share_model
    W_Agent.main_dqn = W_share_model
    B_Agent.optimizer = optim.Adam(B_share_model.parameters(),
                                   lr=args.lr,
                                   eps=args.adam_eps)
    W_Agent.optimizer = optim.Adam(W_share_model.parameters(),
                                   lr=args.lr,
                                   eps=args.adam_eps)
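    # both optimizers update the shared models' parameters directly, so weight
    # updates made in this process are visible to any other process holding them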

    from collections import deque
    from memory import PER_Memory
    memory = PER_Memory(args)  # required below by memory.push() and Agent_ptr.learn()
    data_buffer = deque(maxlen=args.memory_capacity)  # kept from the original, but unused in this snippet
    """
    main loop
    """
    global_count = 0
    episode = 0

    W_Agent.target_dqn_update()
    B_Agent.target_dqn_update()
    W_Agent.train()
    B_Agent.train()

    Ts = []
    Trewards = []
    TQmax = []
    while episode < args.max_episode_length:
        random.seed(time.time())
        T = 0
        turn = 0
        max_action_value = float('-inf')
        state = env.reset()
        evaluation = False
        total_reward = 0
        if episode % args.evaluation_interval == 0:
            evaluation = True
    #    args.epsilon -= 0.8/args.max_episode_length
        while T < args.max_step:

            action_value = float('-inf')
            if T % 2 == 0:
                Agent_ptr = B_Agent
                turn = env.black
            else:
                Agent_ptr = W_Agent
                turn = env.white

            # exploration: play a random move with probability epsilon, and
            # always until learn_start transitions have been collected
            if not evaluation and (random.random() <= args.epsilon
                                   or global_count < args.learn_start):
                action = env.get_random_xy_flat()
            else:
                action, action_value = Agent_ptr.get_action(state)

            max_action_value = max(max_action_value, action_value)

            next_state, reward, done, _ = env.step_flat(action, turn)

            total_reward += reward
            td_error = 1.0  # undefined in the original snippet; assumed fixed initial priority for new transitions
            memory.push(td_error, [state, action, reward, next_state, done])
            state = next_state

            # replay_interval and target_update_interval should be odd: global_count
            # alternates between the black and white agent each step, so an even
            # interval would always update the same agent
            if not evaluation and global_count % args.replay_interval == 0 and global_count > args.learn_start:
                Agent_ptr.learn(memory)
                Agent_ptr.reset_noise()

            if not evaluation and global_count % args.target_update_interval == 0:
                Agent_ptr.target_dqn_update()

            T += 1
            global_count += 1

            if done:
                B_Agent.reset_noise()
                W_Agent.reset_noise()

                if args.render:
                    env.render()
                break

        if evaluation:
            print('episode : ', episode, '  step : ', T, ' max_action ',
                  max_action_value, 'total_reward : ', total_reward)
            Ts.append(episode)
            Trewards.append([total_reward])
            TQmax.append([max_action_value])
            _plot_line(Ts,
                       Trewards,
                       'rewards_' + args.name + '_' + str(rank),
                       path='results')
            _plot_line(Ts,
                       TQmax,
                       'Q_' + args.name + '_' + str(rank),
                       path='results')
        if episode % args.save_interval == 0:
            print('save')
            B_Agent.save('B' + args.name)
            W_Agent.save('W' + args.name)

        episode += 1
Example #5
def act_process(args, share_model, rank, self_play, shared_lr_mul,
                shared_g_cnt, shared_q, lock):
    import random
    import time
    import numpy as np
    # `get_equi_data` (symmetry augmentation) is assumed to be defined at module level.
    print(rank)
    board_max = args.board_max

    from agent import Agent_MCTS
    agent = Agent_MCTS(args, 5, 100, self_play, shared_lr_mul, shared_g_cnt)
    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, args.n_rows)
    board_render = BoardRender(board_max, render_off=True, inline_draw=False)
    board_render.clear()

    Ts = []
    Tloss = []
    Tentropy = []
    try:
        for episode in range(10000):
            start_time = time.time()

            with lock:
                agent.model_update(share_model)

            random.seed(time.time())
            board.reset()
            board_render.clear()
            board_render.draw(board.states)
            """ start a self-play game using a MCTS player, reuse the search tree
            store the self-play data: (state, mcts_probs, z)
            """
            p1, p2 = board.players
            states, mcts_probs, current_players = [], [], []
            #            list_loss = []
            #            list_entropy = []
            for step in range(10000):
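                # each call runs the agent's configured number of MCTS playouts,
                # guided by the latest copy of the shared model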
                move, move_probs = agent.get_action(board, temp=1.0)
                # store the data
                states.append(board.current_state())
                mcts_probs.append(move_probs)
                current_players.append(board.current_player)
                # perform a move
                board.step(move)
                board_render.draw(board.states)
                end, winner = board.game_end()
                if end:
                    #                    time.sleep(1)
                    # winner from the perspective of the current player of each state
                    winners_z = np.zeros(len(current_players))
                    if winner != -1:
                        winners_z[np.array(current_players) == winner] = 1.0
                        winners_z[np.array(current_players) != winner] = -1.0
                    #reset MCTS root node
                    agent.reset_player()
                    if winner != -1:
                        print(rank, "Game end. Winner is player:", winner,
                              'total_step :', step, 'time:',
                              time.time() - start_time)
                    else:
                        print(rank, "Game end. Tie", 'total_step :', step,
                              'time:',
                              time.time() - start_time)
    #                return winner, zip(states, mcts_probs, winners_z)
                    play_data = zip(states, mcts_probs, winners_z)
                    ex_play_data = get_equi_data(play_data, board_max,
                                                 board_max)
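                    # ship the augmented game to the learner through the shared queue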
                    shared_q.put(ex_play_data)
                    break


#            # plot_data
#            if len(data_buffer) > args.batch_size and len(list_loss)!=0:
#
#                Ts.append(episode)
#                Tloss.append(list_loss)
#                Tentropy.append(list_entropy)
#                _plot_line(Ts, Tloss, 'loss', path='./')
#                _plot_line(Ts, Tentropy, 'entropy', path='./')

    except Exception as exc:
        print(rank, 'except end', exc)
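
A minimal sketch, also an assumption rather than code from this project, of how act_process above is presumably launched: several actor processes share one model and one queue via torch.multiprocessing. Every name other than act_process is hypothetical.

import torch.multiprocessing as mp

def spawn_actors(args, share_model, num_actors=4):
    share_model.share_memory()           # make the model's tensors visible to all workers
    shared_q = mp.Queue()                # self-play games flow back to the learner here
    lock = mp.Lock()                     # guards agent.model_update(share_model)
    shared_lr_mul = mp.Value('d', 1.0)   # hypothetical: shared learning-rate multiplier
    shared_g_cnt = mp.Value('i', 0)      # hypothetical: shared global step counter
    procs = []
    for rank in range(num_actors):
        p = mp.Process(target=act_process,
                       args=(args, share_model, rank, True,
                             shared_lr_mul, shared_g_cnt, shared_q, lock))
        p.start()
        procs.append(p)
    return procs, shared_q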