def test_merchant(self) -> None:
        self.game.new_game()

        p_state: PlayerState = self.game.state.player_states[0]

        # Inject cards into hand
        merchant = Merchant()
        first_silver = Silver()
        second_silver = Silver()
        self.game.state.inject(0, merchant)
        self.game.state.inject(0, first_silver)
        self.game.state.inject(0, second_silver)
        self.game.state.inject(0, Estate())
        self.game.state.inject(0, Estate())

        self.game.state.advance_next_decision()

        # Action Phase Decision -- Play Merchant
        r = DecisionResponse([merchant])
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        # Treasure Phase Decision -- Play All Treasures
        r = DecisionResponse([first_silver])
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        r = DecisionResponse([second_silver])
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        self.assertEqual(p_state.coins, 5)

    def testChapelHeuristic(self) -> None:
        self.game.new_game()
        state: State = self.game.state

        state.inject(0, Chapel())
        state.advance_next_decision()

        # Action Phase decision: defaults to playing Chapel
        r: DecisionResponse = DecisionResponse([])
        self.players[0].makeDecision(state, r)
        self.game.state.process_decision(r)

        self.game.state.advance_next_decision()

        # Should auto-trash 4 cards: Coppers (down to 3 total treasure) and Estates
        r = DecisionResponse([])
        self.players[0].makeDecision(state, r)
        self.game.state.process_decision(r)

        # Process TrashCard events
        self.game.state.advance_next_decision()

        n_copper = state.get_card_count(0, Copper)
        n_estate = state.get_card_count(0, Estate)
        self.assertIn(n_copper, (3, 4))
        self.assertEqual(n_copper + n_estate, 6)
Example #3
    def makeDecision(self, s: State, response: DecisionResponse):
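        """Auto-play treasures; in the Buy phase, score each candidate buy
        (including buying nothing) with the classifier and either sample via
        softmax while training or take the argmax for the current player."""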
        d: DecisionState = s.decision
        p: int = s.player
        if s.phase == Phase.ActionPhase:
            raise NotImplementedError('GreedyPlayer does not support action cards yet')
        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            choices = d.card_choices + [None]

            X = s.lookahead_batch_featurize(choices).cpu()

            label_idx = np.argmin(
                self.model.classes_) if p == 1 else np.argmax(
                    self.model.classes_)

            y = self.model.predict_proba(X)

            if self.train:
                card = np.random.choice(choices,
                                        p=softmax(y[:, label_idx], t=self.tau))
            else:
                card = choices[np.argmax(y[:, label_idx])]

            response.single_card = card

    def testGreedyActionHeuristic(self) -> None:
        self.game.new_game()
        state: State = self.game.state

        state.inject(0, Laboratory())
        state.inject(0, Village())
        state.inject(0, Smithy())

        state.advance_next_decision()

        # Action Phase: Play Lab
        r = DecisionResponse([])
        self.players[0].makeDecision(state, r)
        self.assertIsInstance(r.cards[0], Laboratory)
        state.process_decision(r)

        state.advance_next_decision()

        r = DecisionResponse([])
        self.players[0].makeDecision(state, r)
        self.assertIsInstance(r.cards[0], Village)
        state.process_decision(r)

        state.advance_next_decision()

        r = DecisionResponse([])
        self.players[0].makeDecision(state, r)
        self.assertIsInstance(r.cards[0], Smithy)
        state.process_decision(r)

        state.advance_next_decision()

        self.assertEqual(state.get_zone_card_count(0, Zone.Hand), 10)
Example #5
    def makePhaseDecision(self, s: State, response: DecisionResponse):
        d: DecisionState = s.decision
        player = d.controlling_player
        if s.phase == Phase.ActionPhase:
            self.heuristic.makeGreedyActionDecision(s, response)
        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            if not self.train:
                remove_first_card(Curse(), d.card_choices)
            response.single_card = self.heuristic.agenda.buy(
                s, player, d.card_choices)
Example #6
    def makeDecision(self, s: State, response: DecisionResponse):
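        """Auto-play treasures; delegate each Buy-phase choice (including
        buying nothing) to the rollout policy."""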
        d: DecisionState = s.decision
        if s.phase == Phase.ActionPhase:
            raise NotImplementedError('MCTS does not support action cards yet')
        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            choices = d.card_choices + [None]

            # select the next card to buy using the rollout policy
            card = self.rollout.select(choices, state=s)

            response.single_card = card
Example #7
    def step(self, action: DecisionResponse) -> Tuple[State, int, bool, Any]:
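        """Apply `action` to the current Buy-phase decision, then let the
        players' controllers resolve all non-Buy decisions until the next
        Buy phase or game end. Returns a gym-style (state, reward, done,
        info) tuple, with reward in {-1, 0, 1} from player 0's
        perspective."""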
        s: State = self.game.state
        d: DecisionState = s.decision

        if s.phase != Phase.BuyPhase:
            raise ValueError('Cannot step from any phase other than Buy Phase.')

        s.process_decision(action)

        s.advance_next_decision()

        # Delegate all non-Buy decisions until the next Buy phase or game end
        while s.phase != Phase.BuyPhase and not self._done:
            response = DecisionResponse([])
            p = self.game.players[d.controlling_player].controller
            p.makeDecision(s, response)
            s.process_decision(response)
            s.advance_next_decision()

        reward = 0
        if self._done:
            p0win = self.game.is_winner(0)
            p1win = self.game.is_winner(1)
            if p0win and p1win:
                reward = 0
            elif p0win:
                reward = 1
            else:
                reward = -1

        return s, reward, self._done, None

def train_elog(env: Environment, epochs: int, train_epochs_interval: int):
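    """Self-play training loop: play one game per epoch, log a
    (feature, card) pair for every decision, broadcast the terminal reward
    across the episode, and periodically refit each RolloutPlayer's model."""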
    for epoch in tqdm(range(epochs)):
        state = env.reset()
        done = False
        data = {
            'features': [],
            'rewards': [],
            'cards': [],
            'idxs': state.feature.idxs
        }
        while not done:
            action = DecisionResponse([])
            d: DecisionState = state.decision
            player: Player = env.players[d.controlling_player]

            player.makeDecision(state, action)

            x = state.feature.to_numpy()
            data['features'].append(x)
            data['cards'].append(action.single_card)

            obs, reward, done, _ = env.step(action)

        data['rewards'].extend([reward] *
                               (len(data['features']) - len(data['rewards'])))

        for player in env.players:
            if isinstance(player, RolloutPlayer):
                player.rollout.update(**data)
                if (epoch + 1) % train_epochs_interval == 0:
                    player.rollout.learn()
Example #9
    def makeDecision(self, s: State, response: DecisionResponse):
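        """Top-level dispatch: make a phase decision when no card is active,
        otherwise resolve the pending event (topdeck, discard-down,
        Remodel/Expand) with the matching heuristic."""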
        d: DecisionState = s.decision
        player: int = d.controlling_player
        if d.type not in (DecisionType.DecisionSelectCards,
                          DecisionType.DecisionDiscreteChoice):
            logging.error('Invalid decision type')
        if not d.active_card:
            self.makePhaseDecision(s, response)
        elif s.events:
            event = s.events[-1]
            if isinstance(event, PutOnDeckDownToN):
                self.heuristic.makePutDownOnDeckDecision(s, response)
            elif isinstance(event, DiscardDownToN):
                self.heuristic.makeDiscardDownDecision(s, response)
            elif isinstance(event, RemodelExpand):
                if not event.trashed_card:

                    def scoringFunction(card: Card):
                        if isinstance(card, Curse):
                            return 19
                        elif isinstance(card, Estate):
                            return 18
                        elif isinstance(card, VictoryCard):
                            return -200 + card.get_coin_cost()
                        return -card.get_coin_cost()

                    response.cards = heuristic_select_cards(
                        d.card_choices, d.min_cards, scoringFunction)
                else:
                    response.cards.append(
                        self.heuristic.agenda.forceBuy(s, player,
                                                       d.card_choices))
        else:
            self.heuristic.makeBaseDecision(s, response)
Example #10
    def makeDecision(self, s: State, response: DecisionResponse):
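        """Auto-play treasures; in the Buy phase, score each candidate buy
        (including buying nothing) with the value model and choose through
        self.select."""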
        d: DecisionState = s.decision
        p: int = s.player
        if s.phase == Phase.ActionPhase:
            raise NotImplementedError('MCTS does not support action cards yet')
        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            choices = d.card_choices + [None]

            X = s.lookahead_batch_featurize(choices)
            vals = self.model(X).detach().cpu().numpy()

            choice = self.select(p, choices, vals)
            response.single_card = choice
Example #11
    def makeCopyDecision(self, s: State, response: DecisionResponse):
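        """Throne Room target: prefer the highest-cost card among the
        choices."""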
        d: DecisionState = s.decision

        def scoringFunction(card: Card):
            return card.get_coin_cost()

        response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
Example #12
    def run(self):
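        """Run one MCTS iteration: select and expand within the buy tree,
        roll out to game end or the turn limit T, then backpropagate the
        final score along the visited path and update the rollout model."""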
        s = self.game.state
        d: DecisionState = s.decision
        tree_score = 0
        # run the game up to game end or turn limit reached
        while (d.type != DecisionType.DecisionGameOver
               and s.player_states[0]._turns < self.T):
            if d.text:
                logging.info(d.text)
            response = DecisionResponse([])
            player = self.game.players[d.controlling_player]
            next_node = player.controller.makeDecision(s, response)

            if s.phase == Phase.BuyPhase:
                # apply selection until leaf node is reached
                if next_node:
                    assert next_node == self.player.node
                    self.player.node.n += 1
                elif not self.expanded:
                    # expand one node
                    cards = list(
                        filter(lambda x: not isinstance(x, Curse),
                               d.card_choices + [None]))
                    self.player.node.add_unique_children(cards)
                    self.expanded = True
                    self.player.node = self.player.node.get_child_node(
                        response.single_card)
                    self.player.node.n += 1
                    # Record the score at the point we leave the tree
                    # (used to split credit between tree and rollout)
                    tree_score = self.game.get_player_scores()[0]
                    self.data.update_split_scores(tree_score, False, self.iter)
                elif self.rollout_model == Rollout.HistoryHeuristic:
                    self.rollout_cards.append(response.single_card)

            s.process_decision(response)
            s.advance_next_decision()

        score = self.game.get_player_scores()[0]
        # update data
        self.data.update_split_scores(score - tree_score, True, self.iter)

        # backpropagate
        delta = score
        self.player.node.v += delta
        self.player.node = self.player.node.parent
        while self.player.node != self.player.root:
            self.player.node.update_v(lambda x: sum(x) / len(x))
            self.player.node = self.player.node.parent

        # update history heuristic
        if self.rollout_model == Rollout.HistoryHeuristic:
            self.rollout.update(cards=self.rollout_cards, score=score)
        elif self.rollout_model == Rollout.LinearRegression:
            counts = self.game.state.get_card_counts(0)
            self.rollout.update(counts=counts, score=score, i=self.iter)

        return self.game.get_player_scores()[0]
Example #13
    def makeDecision(self, s: State, response: DecisionResponse):
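        """Greedy actions and auto-played treasures; in the Buy phase, follow
        UCB1 selection while inside the tree, falling back to the rollout
        policy once out of tree or when the tree is disabled."""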
        d: DecisionState = s.decision
        if s.phase == Phase.ActionPhase:
            if not d.active_card:
                self.heuristic.makeGreedyActionDecision(s, response)
            elif s.events:
                event = s.events[-1]
                if isinstance(event, DiscardDownToN):
                    self.heuristic.makeDiscardDownDecision(s, response)
                elif isinstance(event, MoatReveal):
                    self.heuristic.makeBaseDecision(s, response)
                else:
                    raise ValueError(f'Event {type(event)} not supported')
            else:
                self.heuristic.makeBaseDecision(s, response)

        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            # Remove Curse
            choices = list(
                filter(lambda x: not isinstance(x, Curse),
                       d.card_choices + [None]))

            # Rollout (out-of-tree) case; tree actually isn't that good
            if not self.tree.in_tree or not self.use_tree:
                logging.log(level=BUY, msg='Rollout')
                response.single_card = self.rollout.select(choices, state=s)
                return

            # the next node in the tree is the one that maximizes the UCB1 score
            try:
                # Remove Copper and Victory cards -- tree never gets that deep anyways
                tree_choices = list(
                    filter(
                        lambda x: not isinstance(x, Copper)
                        and not isinstance(x, VictoryCard), choices))
                card = self.tree.select(tree_choices)
                logging.log(level=BUY, msg=f'Selection: {self.tree.node.n}')
            except ValueError:
                card = self.rollout.select(choices, state=s)

            response.single_card = card
Example #14
    def makeDecision(self, s: State, response: DecisionResponse):
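        """Auto-play treasures; in the Buy phase, featurize the candidate
        buys and take the argmax of the MLP output column for the current
        player (the column indices assume a fixed class ordering)."""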
        d: DecisionState = s.decision
        p: int = s.player
        if s.phase == Phase.ActionPhase:
            raise NotImplementedError('GreedyMLPPlayer does not support action cards yet')
        elif s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
        else:
            choices = d.card_choices + [None]

            X = s.lookahead_batch_featurize(choices)

            label_idx = 0 if p == 1 else 2

            y_pred = self.model.forward(X)

            card_idx = torch.argmax(y_pred[:, label_idx])

            response.single_card = choices[card_idx]
Example #15
    def makeDecision(self, s: State, response: DecisionResponse):
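        """Prompt on stdin for card selections and discrete choices,
        re-asking until the entered indices are in range."""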
        d: DecisionState = s.decision
        if s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
            return

        if d.type == DecisionType.DecisionSelectCards:
            cardsToPick = -1
            d.print_card_choices()
            while (cardsToPick < d.min_cards or cardsToPick > d.max_cards):
                text = ''
                while not text:
                    text = input(
                        f'Pick between {d.min_cards} and {d.max_cards} of the above cards:\n'
                    )
                cardsToPick = int(text)

            responseIdxs = []
            for i in range(cardsToPick):
                cardIdx = -1
                while (cardIdx == -1 or cardIdx in responseIdxs
                       or cardIdx >= len(d.card_choices)):
                    d.print_card_choices()
                    text = ''
                    while not text:
                        text = input('Choose another card:\n')
                    cardIdx = int(text)
                responseIdxs.append(cardIdx)
                response.cards.append(d.card_choices[cardIdx])
        elif d.type == DecisionType.DecisionDiscreteChoice:
            choice = -1
            while choice == -1 or choice > d.min_cards:
                text = ''
                while not text:
                    text = input(
                        'Please make a discrete choice from the above cards:\n'
                    )
                choice = int(text)
                d.print_card_choices()
            response.choice = choice
        else:
            logging.error(f'Player {s.player} given invalid decision type.')

def simulate(env: Environment, n: int, tree: GameTree, turn_log=False,
             action_log=False, card_log=False) -> SimulationData:
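    """Run n games, optionally advancing a shared GameTree with each buy,
    and accumulate per-turn, per-action, and per-card statistics into a
    SimulationData, which is finalized, summarized, and returned."""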
    # TODO: clean this up
    sim_data = SimulationData(Supply(env.config).get_supply_card_types())

    for i in tqdm(range(n)):
        state: State = env.reset()
        if tree:
            tree.reset(state)
        done = False
        t_start = time.time()
        starting_player_buy = None
        while not done:
            action: DecisionResponse = DecisionResponse([])
            d: DecisionState = state.decision
            pid: int = d.controlling_player
            player = env.players[pid]
            player.makeDecision(state, action)

            if state.phase == Phase.ActionPhase:
                # +1 to turns to get current turn
                sim_data.update_action(i, pid, state.player_states[pid].turns + 1, action.cards[0])

            if state.phase == Phase.BuyPhase and tree:
                tree.advance(action.single_card)

            log_buy = (state.phase == Phase.BuyPhase)

            obs, reward, done, _ = env.step(action)

            if turn_log and log_buy:
                if pid == 0:
                    starting_player_buy = action.single_card
                else:
                    sim_data.update_turn(i, 0, state.player_states[0].turns, state.get_player_score(0), starting_player_buy, state.get_coin_density(0))
                    sim_data.update_turn(i, 1, state.player_states[1].turns, state.get_player_score(1), action.single_card, state.get_coin_density(1))
            if card_log and log_buy:
                if pid == 1:
                    sim_data.update_card(i, 0, state.player_states[0].turns, state.get_card_counts(0))
                    sim_data.update_card(i, 1, state.player_states[1].turns, state.get_card_counts(1))

        # If the starting player took the game's final turn, the pid == 1
        # branch above never logged their last buy, so flush it here.
        if state.player_states[0].turns > state.player_states[1].turns:
            sim_data.update_card(i, 0, state.player_states[0].turns, state.get_card_counts(0))
            sim_data.update_turn(i, 0, state.player_states[0].turns, state.get_player_score(0), starting_player_buy, state.get_coin_density(0))

        t_end = time.time()
        sim_data.update(env.game, t_end - t_start)

    sim_data.finalize(env.game)

    print('===SUMMARY===')
    print(sim_data.summary)

    return sim_data
Example #17
    def makeDiscardDownDecision(self, s: State, response: DecisionResponse):
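        """Discard priorities: victory cards first, then Curses, then
        Coppers; among everything else, prefer the cheapest card."""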
        d: DecisionState = s.decision

        def scoringFunction(card: Card):
            if isinstance(card, VictoryCard):
                return 20
            elif isinstance(card, Curse):
                return 19
            elif isinstance(card, Copper):
                return 18
            return -card.get_coin_cost()

        response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
Example #18
    def run(self, T=None):
        d = self.state.decision
        self.state.advance_next_decision()
        while d.type != DecisionType.DecisionGameOver:
            if T is not None and all(t.turns >= T for t in self.state.player_states):
                break
            if d.text:
                logging.info(d.text)
            response = DecisionResponse([])
            player = self.players[self.state.decision.controlling_player]
            player.controller.makeDecision(self.state, response)
            self.state.process_decision(response)
            self.state.advance_next_decision()
Example #19
    def makeDecision(self, s: State, response: DecisionResponse):
        d: DecisionState = s.decision

        # Do not allow RandomPlayer to purchase curses
        if s.phase == Phase.BuyPhase and not self.train:
            remove_first_card(Curse(), d.card_choices)

        # Ensure random player plays all treasures
        if s.phase == Phase.TreasurePhase:
            response.single_card = d.card_choices[0]
            return

        if d.type == DecisionType.DecisionSelectCards:
            cards_to_pick = d.min_cards
            if d.max_cards > d.min_cards:
                cards_to_pick = random.randint(d.min_cards, d.max_cards)

            response.cards = random.sample(d.card_choices,
                                           k=min(cards_to_pick,
                                                 len(d.card_choices)))
        elif d.type == DecisionType.DecisionDiscreteChoice:
            response.choice = random.randint(0, d.min_cards)
        else:
            logging.error('Invalid decision type')
Example #20
    def makeGreedyActionDecision(self, s: State, response: DecisionResponse):
        d: DecisionState = s.decision
        assert d.min_cards == 0 and d.max_cards == 1, 'Invalid decision parameters'

        def scoringFunction(card: Card):
            '''Play all cantrips first, then greedily'''
            cantrip_bonus = 7
            score = min(card.get_coin_cost(), 6)

            if is_cantrip(card):
                score += cantrip_bonus

            return score

        cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
        response.cards = cards
Example #21
    def makePutDownOnDeckDecision(self, s: State, response: DecisionResponse):
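        """Topdeck priorities: with excess actions among the choices, put
        back the action granting the fewest +Actions; else with treasures
        available, put back the smallest treasure; otherwise the cheapest
        card."""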
        d: DecisionState = s.decision

        def scoringFunction(card: Card):
            if has_excess_actions(s.decision.card_choices):
                if isinstance(card, ActionCard):
                    return 100 - card.get_plus_actions()
                return -card.get_coin_cost()
            elif has_treasure_cards(s.decision.card_choices):
                if isinstance(card, TreasureCard):
                    return 100 - card.get_treasure()
                return -card.get_coin_cost()
            else:
                return -card.get_coin_cost()

        response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
Example #22
    def reset(self, **kwargs) -> State:
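        """Start a fresh Game (optionally shuffling player order) and
        advance through decisions until the first Buy-phase decision."""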
        if self.randomize_player_order:
            np.random.shuffle(self.players)
        self.game = Game(self.config, self.players)
        self.game.new_game()
        self.game.state.advance_next_decision()

        s: State = self.game.state
        d: DecisionState = s.decision

        while s.phase != Phase.BuyPhase and not self._done:
            response = DecisionResponse([])
            p = self.game.players[d.controlling_player].controller
            p.makeDecision(s, response)
            s.process_decision(response)
            s.advance_next_decision()

        return self.game.state

    def test_vassal_effect_play_action(self) -> None:
        self.game.new_game()
        p_state: PlayerState = self.game.state.player_states[0]
        opp_state: PlayerState = self.game.state.player_states[1]
        card = Bandit()
        p_state._deck[-1] = card
        first_discarded = opp_state._deck[-1]
        second_discarded = opp_state._deck[-2]
        effect = VassalEffect()

        # Play Bandit
        r = DecisionResponse([], 1)
        effect.play_action(self.game.state)
        self.game.state.advance_next_decision()
        self.game.state.process_decision(r)

        # Process Bandit events
        self.game.state.advance_next_decision()
        self.assertIn(card, p_state._play_area)
        self.assertIn(first_discarded, opp_state._discard)
        self.assertIn(second_discarded, opp_state._discard)
Example #24
def sample_training_batch(n: int, p: float, config: GameConfig,
                          players: Sequence[Player],
                          win_loss=False) -> Tuple[np.ndarray, np.ndarray]:
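    """Generate (features, labels) training data from self-play. The
    parameter p is overloaded: for 0 < p <= 1, each decision state is kept
    with probability p; for p > 1, it acts as a turn cutoff; for p <= 0,
    only each game's final state is kept. With win_loss set, losses are
    relabeled from -1 to 0."""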
    env = DefaultEnvironment(config, players)
    X = []
    y = []

    rng = np.random.default_rng()

    print('Generating training data from self-play...')
    for epoch in tqdm(range(n)):
        state: State = env.reset()
        done = False
        while not done:
            action = DecisionResponse([])
            d = state.decision
            player = players[d.controlling_player]
            player.makeDecision(state, action)
            obs, reward, done, _ = env.step(action)

            feature = obs.feature.to_numpy()
            if 0 < p <= 1:
                if rng.uniform(0, 1) < p:
                    X.append(feature)
            else:
                if obs.player_states[d.controlling_player].turns < p:
                    X.append(feature)

        if p <= 0:
            X.append(feature)

        y.extend([reward] * (len(X) - len(y)))

    y = np.array(y)

    if win_loss:
        y[y == -1] = 0

    return np.array(X), y
Example #25
    def test_event_sentry(self) -> None:
        self.game.new_game()

        # Inject Sentry in player's hand
        sentry = Sentry()

        self.game.state.inject(0, sentry)

        self.game.state.advance_next_decision()

        # Action Phase Decision
        r = DecisionResponse([sentry])
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        # Choose to trash one card
        d = self.game.state.decision
        trashed = d.card_choices[0]
        r = DecisionResponse([trashed])
        self.game.state.process_decision(r)
        # Trash card
        self.game.state.advance_next_decision()

        self.assertEqual(self.game.state.trash, [trashed])

        # Choose to discard one card
        d = self.game.state.decision
        discarded = d.card_choices[0]
        r = DecisionResponse([discarded])
        self.game.state.process_decision(r)
        # Discard card
        self.game.state.advance_next_decision()

        d = self.game.state.decision
        p_state: PlayerState = self.game.state.player_states[0]
        self.assertEqual(p_state._discard, [discarded])
        self.assertIsNone(d.active_card)
Example #26
    def test_moat_reveal(self) -> None:
        self.game.new_game()

        # Inject necessary cards into players' hands
        attack_card = Militia()
        moat_card = Moat()
        self.game.state.inject(0, attack_card)
        self.game.state.inject(1, moat_card)

        self.game.state.advance_next_decision()

        # Action Phase decision
        r = DecisionResponse([attack_card])
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        # MoatReveal reaction
        r = DecisionResponse([])
        r.choice = 0
        self.game.state.process_decision(r)
        self.game.state.advance_next_decision()

        self.assertEqual(self.game.state.events, [])
Example #27
    def makeBaseDecision(self, s: State, response: DecisionResponse):
        """Card-specific heuristics for resolving base-set card effects."""
        d: DecisionState = s.decision
        card = d.active_card
        player = d.controlling_player
        p_state: PlayerState = s.player_states[player]
        if isinstance(card, Cellar):
            # Discard victory cards and cheap cards to cycle the deck
            for c in d.card_choices:
                if isinstance(c, VictoryCard) or c.get_coin_cost() < 2:
                    response.cards.append(c)
        elif isinstance(card, Chapel):
            # Trash up to 4 cards: Curses, Estates and spare Chapels always,
            # Coppers only while total treasure stays above 3
            treasureValue = s.get_total_coin_count(player)
            num_trashed = 0
            for c in d.card_choices:
                trashCoppers = (treasureValue > 3)
                if num_trashed == 4:
                    break
                if isinstance(c, Curse):
                    response.cards.append(c)
                    num_trashed += 1
                elif isinstance(c, Copper) and trashCoppers:
                    response.cards.append(c)
                    num_trashed += 1
                    treasureValue -= 1
                elif isinstance(c, Estate):
                    response.cards.append(c)
                    num_trashed += 1
                elif isinstance(c, Chapel):
                    response.cards.append(c)
                    num_trashed += 1
        elif isinstance(card, Moat):
            response.choice = 0
        elif isinstance(card, Bureaucrat):
            response.cards.append(d.card_choices[0])
        elif isinstance(card, Militia):
            self.makeDiscardDownDecision(s, response)
        elif isinstance(card, ThroneRoom):
            self.makeCopyDecision(s, response)
        elif isinstance(card, Library):
            if s.player_states[s.player].actions == 0:
                response.choice = 0
            else:
                response.choice = 1
        elif isinstance(card, Mine):
            event = s.events[-1]
            if not event.trashed_card:
                # Choose the treasure to trash, preferring Gold, then Silver,
                # then Copper (when still available in the supply)
                def scoringFunction(card: Card):
                    if isinstance(card, Gold) and s.supply[Gold] > 0:
                        return 20
                    if isinstance(card, Silver) and s.supply[Silver] > 0:
                        return 19
                    if isinstance(card, Copper) and s.supply[Copper] > 0:
                        return 18
                    return -card.get_coin_cost()
                response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
            else:
                response.cards.append(self.agenda.forceBuy(s, player, d.card_choices))
        elif isinstance(card, Harbinger):
            # Topdeck an action if one can still be played, else the priciest card
            def scoringFunction(card: Card):
                if has_excess_actions(p_state.hand) and isinstance(card, ActionCard):
                    return 100 + card.get_coin_cost()
                return card.get_coin_cost()
            response.cards = heuristic_select_cards(d.card_choices, d.min_cards, scoringFunction)
        elif isinstance(card, Artisan):
            event = s.events[-1]
            if not event.gained_card:
                response.cards.append(self.agenda.forceBuy(s, player, d.card_choices))
            else:
                self.makePutDownOnDeckDecision(s, response)
        elif isinstance(card, Poacher):
            self.makeDiscardDownDecision(s, response)
        else:
            logging.error('Unexpected decision')
Example #28
def train_mcts(env: Environment,
               tree: GameTree,
               path: str,
               rollout_path: str,
               epochs: int,
               train_epochs_interval: int = 1000,
               train_epochs_cap: int = 10000,
               save_epochs: int = 1000,
               scoring: str = 'win_loss'):
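    """MCTS self-play training: each epoch plays one game while descending
    the shared tree, logs (feature, card) pairs for the rollout model,
    converts the outcome into per-player rewards according to `scoring`,
    and backpropagates them along the visited tree path. The tree and the
    rollout model are checkpointed every `save_epochs` epochs and saved
    again at the end."""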
    for epoch in tqdm(range(epochs)):
        state: State = env.reset()
        tree.reset(state)
        done = False
        expanded = False
        flip = False
        data = {
            'features': [],
            'rewards': [],
            'cards': [],
            'idxs': state.feature.idxs
        }
        data['model_name'] = os.path.split(path)[-1]
        while not done:
            action = DecisionResponse([])
            d: DecisionState = state.decision
            player: Player = env.players[d.controlling_player]

            # Add any states now visible due to randomness
            if tree.in_tree:
                cards = d.card_choices + [None]
                tree.node.add_unique_children(cards)

            player.makeDecision(state, action)

            if isinstance(player, MCTSPlayer):
                x = state.feature.to_numpy()
                data['features'].append(x)
                data['cards'].append(action.single_card)

            # Advance to the next node within the tree, implicitly adding a node the first time we exit tree
            if tree.in_tree:
                tree.advance(action.single_card)

            # First time we go out of tree, enter rollout phase
            if not expanded and not tree.in_tree:
                # Previous node is starting player action, so current node is opponent player action.
                flip = (state.player == 1)
                expanded = True

            obs, reward, done, _ = env.step(action)

        data['rewards'].extend([reward] *
                               (len(data['features']) - len(data['rewards'])))
        start_idx = 1 if flip else 0
        p0_score, p1_score = state.get_player_score(0), state.get_player_score(1)
        if scoring == 'score':
            p0_reward, p1_reward = p0_score, p1_score
        elif scoring == 'win_loss':
            if reward == 0:
                p0_reward, p1_reward = 1 / 2, 1 / 2
            elif reward == 1:
                p0_reward, p1_reward = 1, 0
            else:
                p0_reward, p1_reward = 0, 1
        elif scoring == 'score_ratio':
            # Shift scores to be non-negative before normalizing
            min_score = min(p0_score, p1_score)
            if min_score < 0:
                p0_score_nonneg = p0_score + abs(min_score)
                p1_score_nonneg = p1_score + abs(min_score)
            else:
                p0_score_nonneg, p1_score_nonneg = p0_score, p1_score
            if p0_score_nonneg == 0 and p1_score_nonneg == 0:
                p0_reward, p1_reward = 0, 0
            else:
                total_score = p0_score_nonneg + p1_score_nonneg
                p0_reward = p0_score_nonneg / total_score
                p1_reward = p1_score_nonneg / total_score
        else:
            raise ValueError(f'Unknown scoring mode: {scoring}')

        tree.node.backpropagate((p0_reward, p1_reward), start_idx=start_idx)

        if save_epochs > 0 and epoch % save_epochs == 0:
            save(path, tree._root)

            for player in env.players:
                if isinstance(player, MCTSPlayer):
                    player.rollout.save(rollout_path)
                    break

        # mcts players share the tree, so only update once
        for player in env.players:
            if isinstance(player, MCTSPlayer):
                player.rollout.update(**data)
                if (epoch + 1) % train_epochs_interval == 0 and (
                        epoch + 1) < train_epochs_cap:
                    player.rollout.learn()

    for player in env.players:
        if isinstance(player, MCTSPlayer):
            player.rollout.save(rollout_path)
            break
    save(path, tree._root)