Example #1
    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        super().__init__(t_prof=t_prof, eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof), chief_handle=chief_handle,
                         eval_type="Offline_Winnings", log_conf_interval=True)

        self._args = t_prof.module_args["offline"]
        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._eval_agents = [
            eval_agent_cls(t_prof=t_prof)
            for _ in range(self._env_bldr.N_SEATS)
        ]

        self._REFERENCE_AGENT = 0
Example #2
    def __init__(self, t_prof):
        super().__init__(t_prof=t_prof)
        self._ps_handles = None
        self._la_handles = None
        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        self._SINGLE = EvalAgentDeepCFR.EVAL_MODE_SINGLE in self._t_prof.eval_modes_of_algo
        self._AVRG = EvalAgentDeepCFR.EVAL_MODE_AVRG_NET in self._t_prof.eval_modes_of_algo

        # """"""""""""""""""""""""""""
        # SD-CFR
        # """"""""""""""""""""""""""""
        if self._SINGLE:
            self._strategy_buffers = [
                StrategyBuffer(t_prof=t_prof,
                               owner=p,
                               env_bldr=self._env_bldr,
                               max_size=None,
                               device=self._t_prof.device_inference)
                for p in range(t_prof.n_seats)
            ]

            if self._t_prof.log_verbose:
                self._exp_mem_usage = self.create_experiment(
                    self._t_prof.name + " Chief_Memory_Usage")
Example #3
    def __init__(self, t_prof):
        super().__init__(t_prof=t_prof)
        self._t_prof = t_prof
        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        self._ps_handles = None
        self._la_handles = None
Example #4
    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        super().__init__(t_prof=t_prof)
        self._args = t_prof.module_args["rlbr"]

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        self._chief_handle = chief_handle
        self._eval_agent_cls = eval_agent_cls
        self._eval_env_bldr = _util.get_env_builder_rlbr(t_prof=t_prof)

        self._ddqns = [None for _ in range(self._eval_env_bldr.N_SEATS)]
        self._rlbr_seat_id = None
        self._agent_seat_id = None
        self._rlbr_env_wrapper = None
        self._opponent = None
        self._buf = None
        self._br_memory_saver = None

        if t_prof.nn_type == "recurrent":
            from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
            from PokerRL.rl.buffers.BRMemorySaverRNN import BRMemorySaverRNN

            self.CircularBufferCls = CircularBufferRNN
            self.BRMemorySaverCls = BRMemorySaverRNN
        elif t_prof.nn_type == "feedforward":
            from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
            from PokerRL.rl.buffers.BRMemorySaverFLAT import BRMemorySaverFLAT

            self.CircularBufferCls = CircularBufferFLAT
            self.BRMemorySaverCls = BRMemorySaverFLAT

        else:
            raise ValueError(t_prof.nn_type)
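
Example #4 only selects and stores the buffer classes; Examples #11 and #13 below show the constructor signature those classes are later called with. A minimal sketch of how this worker might instantiate the selected class, assuming the env_bldr/max_size signature from those examples (the cir_buf_size attribute on this module's args is an assumption, not taken from this example):

    # Sketch only: instantiate whichever buffer class was selected above.
    # The keyword arguments mirror Examples #11 and #13; the exact size
    # attribute on self._args is an assumption.
    self._buf = self.CircularBufferCls(env_bldr=self._env_bldr,
                                       max_size=self._args.cir_buf_size)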
Example #5
    def __init__(self, t_prof, chief_handle):
        super().__init__(t_prof=t_prof)

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._chief_handle = chief_handle

        self._device = torch.device(t_prof.device_parameter_server)
Example #6
    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        super().__init__(t_prof=t_prof,
                         eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof),
                         chief_handle=chief_handle,
                         evaluator_name="Head2Head_Winnings",
                         log_conf_interval=True)

        self._args = t_prof.module_args["h2h"]
        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        assert self._env_bldr.N_SEATS == 2

        self._eval_agents = [
            eval_agent_cls(t_prof=t_prof)
            for _ in range(self._env_bldr.N_SEATS)
        ]

        self._REFERENCE_AGENT = 0
Example #7
    def __init__(self, t_prof, chief_handle, eval_agent_cls):
        super().__init__(t_prof=t_prof, eval_env_bldr=rl_util.get_env_builder(t_prof=t_prof), chief_handle=chief_handle,
                         eval_type="BR")
        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        assert self._env_bldr.N_SEATS == 2

        self._eval_agent = eval_agent_cls(t_prof=t_prof)

        self._game_trees = [
            PublicTree(env_bldr=self._env_bldr,
                       stack_size=stack_size,
                       stop_at_street=None,
                       put_out_new_round_after_limit=True,
                       is_debugging=self._t_prof.DEBUGGING)
            for stack_size in self._t_prof.eval_stack_sizes
        ]

        for gt in self._game_trees:
            gt.build_tree()
            print("Tree with stack size", gt.stack_size, "has", gt.n_nodes, "nodes out of which", gt.n_nonterm,
                  "are non-terminal.")
Example #8
    def __init__(self, t_prof, mode=None, device=None):
        """
        Args:
            t_prof (TrainingProfile):
            mode:                       Any mode your algorithm's eval agent can be evaluated in. Specify modes
                                        as class variables and pass one of them here. Can be changed later by calling
                                        .to_mode(new_mode) on this instance
            device (torch.device):      The device the eval agent shall live and act on.
        """
        self.t_prof = t_prof
        self.ray = MaybeRay(runs_distributed=t_prof.DISTRIBUTED, runs_cluster=t_prof.CLUSTER)
        self.env_bldr = rl_util.get_env_builder(t_prof=t_prof)

        self._internal_env_wrapper = self.env_bldr.get_new_wrapper(is_evaluating=True, stack_size=None)
        self._mode = mode

        if device is None:
            self.device = self.t_prof.device_inference
        else:
            self.device = device
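
The docstring in Example #8 describes the mode/device contract for eval agents: a mode constant may be passed at construction and changed later with .to_mode(new_mode). A minimal usage sketch, assuming EvalAgentDeepCFR (seen in Examples #2 and #10) follows this base-class signature and exposes the EVAL_MODE_SINGLE constant:

    # Hedged sketch, not verified against PokerRL's API: construct an eval
    # agent without a mode, then switch to SD-CFR (single-network) evaluation.
    import torch

    eval_agent = EvalAgentDeepCFR(t_prof=t_prof,
                                  mode=None,
                                  device=torch.device("cpu"))
    eval_agent.to_mode(EvalAgentDeepCFR.EVAL_MODE_SINGLE)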
Example #9
    def __init__(self, t_prof, br_agent, mode=None, device=None):
        super().__init__(t_prof=t_prof, mode=mode, device=device)

        self.tree = PublicTree(
            env_bldr=rl_util.get_env_builder(t_prof=t_prof),
            stack_size=t_prof.eval_stack_sizes[0],
            stop_at_street=None,
            put_out_new_round_after_limit=True,
            is_debugging=t_prof.DEBUGGING
        )
        self.tree.build_tree()
        self.br_agent = br_agent  # agent to play best response against
        self.solve_br()

        self.modes = ["EVAL", "BR", "BAYESIAN"]
        if mode:
            self.mode = mode
        else:
            self.mode = "EVAL"  # default is "EVAL"

        if self.mode == "BAYESIAN":
            self._fill_tree_w_prior()
Example #10
    def __init__(self, t_prof, worker_id, chief_handle):
        super().__init__(t_prof=t_prof)

        self._adv_args = t_prof.module_args["adv_training"]

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._id = worker_id
        self._chief_handle = chief_handle

        self._adv_buffers = [
            AdvReservoirBuffer(
                owner=p,
                env_bldr=self._env_bldr,
                max_size=self._adv_args.max_buffer_size,
                nn_type=t_prof.nn_type,
                iter_weighting_exponent=self._t_prof.iter_weighting_exponent)
            for p in range(self._t_prof.n_seats)
        ]

        self._adv_wrappers = [
            AdvWrapper(owner=p,
                       env_bldr=self._env_bldr,
                       adv_training_args=self._adv_args,
                       device=self._adv_args.device_training)
            for p in range(self._t_prof.n_seats)
        ]

        self._AVRG = EvalAgentDeepCFR.EVAL_MODE_AVRG_NET in self._t_prof.eval_modes_of_algo
        self._SINGLE = EvalAgentDeepCFR.EVAL_MODE_SINGLE in self._t_prof.eval_modes_of_algo

        # """"""""""""""""""""""""""""
        # Deep CFR
        # """"""""""""""""""""""""""""
        if self._AVRG:
            self._avrg_args = t_prof.module_args["avrg_training"]

            self._avrg_buffers = [
                AvrgReservoirBuffer(owner=p,
                                    env_bldr=self._env_bldr,
                                    max_size=self._avrg_args.max_buffer_size,
                                    nn_type=t_prof.nn_type,
                                    iter_weighting_exponent=self._t_prof.iter_weighting_exponent)
                for p in range(self._t_prof.n_seats)
            ]

            self._avrg_wrappers = [
                AvrgWrapper(owner=p,
                            env_bldr=self._env_bldr,
                            avrg_training_args=self._avrg_args,
                            device=self._avrg_args.device_training)
                for p in range(self._t_prof.n_seats)
            ]

            if self._t_prof.sampler.lower() == "mo":
                self._data_sampler = MultiOutcomeSampler(
                    env_bldr=self._env_bldr,
                    adv_buffers=self._adv_buffers,
                    avrg_buffers=self._avrg_buffers,
                    n_actions_traverser_samples=self._t_prof.n_actions_traverser_samples)
            else:
                raise ValueError("Currently we don't support",
                                 self._t_prof.sampler.lower(), "sampling.")
        else:
            if self._t_prof.sampler.lower() == "mo":
                self._data_sampler = MultiOutcomeSampler(
                    env_bldr=self._env_bldr,
                    adv_buffers=self._adv_buffers,
                    avrg_buffers=None,
                    n_actions_traverser_samples=self._t_prof.n_actions_traverser_samples)
            else:
                raise ValueError("Currently we don't support",
                                 self._t_prof.sampler.lower(), "sampling.")

        if self._t_prof.log_verbose:
            self._exp_mem_usage = self._ray.get(
                self._ray.remote(
                    self._chief_handle.create_experiment, self._t_prof.name +
                    "_LA" + str(worker_id) + "_Memory_Usage"))
            self._exps_adv_buffer_size = self._ray.get([
                self._ray.remote(
                    self._chief_handle.create_experiment, self._t_prof.name +
                    "_LA" + str(worker_id) + "_P" + str(p) + "_ADV_BufSize")
                for p in range(self._t_prof.n_seats)
            ])
            if self._AVRG:
                self._exps_avrg_buffer_size = self._ray.get([
                    self._ray.remote(
                        self._chief_handle.create_experiment,
                        self._t_prof.name + "_LA" + str(worker_id) + "_P" +
                        str(p) + "_AVRG_BufSize")
                    for p in range(self._t_prof.n_seats)
                ])
Example #11
    def __init__(self, t_prof, worker_id, chief_handle):
        super().__init__(t_prof=t_prof)

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._id = worker_id
        self._chief_handle = chief_handle

        self._ddqn_args = t_prof.module_args["ddqn"]
        self._avg_args = t_prof.module_args["avg"]

        if t_prof.nn_type == "recurrent":
            from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
            from PokerRL.rl.buffers.BRMemorySaverRNN import BRMemorySaverRNN
            from NFSP.workers.la.action_buffer.ActionBufferRNN import ActionBufferRNN, AvgMemorySaverRNN

            BR_BUF_CLS = CircularBufferRNN
            BR_MEM_SAVER = BRMemorySaverRNN
            AVG_BUF_CLS = ActionBufferRNN
            AVG_MEM_SAVER = AvgMemorySaverRNN

        elif t_prof.nn_type == "feedforward":
            from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
            from PokerRL.rl.buffers.BRMemorySaverFLAT import BRMemorySaverFLAT
            from NFSP.workers.la.action_buffer.ActionBufferFLAT import ActionBufferFLAT, AvgMemorySaverFLAT

            BR_BUF_CLS = CircularBufferFLAT  # TODO: is this wrong? Nope!
            BR_MEM_SAVER = BRMemorySaverFLAT
            AVG_BUF_CLS = ActionBufferFLAT
            AVG_MEM_SAVER = AvgMemorySaverFLAT
        else:
            raise ValueError(t_prof.nn_type)

        self._avg_bufs = [
            AVG_BUF_CLS(env_bldr=self._env_bldr,
                        max_size=self._avg_args.res_buf_size,
                        min_prob=self._avg_args.min_prob_res_buf)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._br_bufs = [
            BR_BUF_CLS(env_bldr=self._env_bldr,
                       max_size=self._ddqn_args.cir_buf_size)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._action_and_hand_buffer = ActionAndHandBufferFLAT(
            env_bldr=self._env_bldr,
            max_size=self._t_prof.action_and_hand_buffer_size)
        self._avg_memory_savers = [[
            AVG_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._avg_bufs[p])
            for _ in range(self._t_prof.n_envs)
        ] for p in range(self._env_bldr.N_SEATS)]
        self._br_memory_savers = [[
            BR_MEM_SAVER(env_bldr=self._env_bldr, buffer=self._br_bufs[p])
            for _ in range(self._t_prof.n_envs)
        ] for p in range(self._env_bldr.N_SEATS)]
        self._br_learner = [
            DDQN(owner=p, ddqn_args=self._ddqn_args, env_bldr=self._env_bldr)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._avg_learner = [
            AvgWrapper(owner=p,
                       env_bldr=self._env_bldr,
                       avg_training_args=self._avg_args)
            for p in range(self._env_bldr.N_SEATS)
        ]

        self._seat_actors = [
            SeatActor(t_prof=t_prof,
                      env_bldr=self._env_bldr,
                      seat_id=p,
                      br_memory_savers=self._br_memory_savers[p],
                      avg_buf_savers=self._avg_memory_savers[p],
                      br_learner=self._br_learner[p],
                      avg_learner=self._avg_learner[p])  #,
            #action_and_hand_buffer=self._action_and_hand_bufs[p])
            for p in range(self._env_bldr.N_SEATS)
        ]

        self._parallel_env = ParallelEnvs(t_prof=t_prof,
                                          env_bldr=self._env_bldr,
                                          n_envs=self._t_prof.n_envs)

        self._last_step_wrappers = self._parallel_env.reset()
        for p in range(self._env_bldr.N_SEATS):
            self._seat_actors[p].init([
                sw for plyr_sws in self._last_step_wrappers for sw in plyr_sws
            ])
Example #12
    def __init__(self, t_prof):
        self._t_prof = t_prof
        self._env_bldr = rl_util.get_env_builder(t_prof)
        self._env_wrapper = self._env_bldr.get_new_wrapper(is_evaluating=False)
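
Example #12 is the smallest instance of the pattern every example on this page repeats: build one environment builder from the training profile, then ask it for wrappers. A condensed sketch of that shared pattern (the import path for rl_util is an assumption; the two calls themselves appear verbatim in Examples #8 and #12):

    # Recurring PokerRL pattern, sketched under the assumption that rl_util
    # is importable from PokerRL.rl.
    from PokerRL.rl import rl_util

    env_bldr = rl_util.get_env_builder(t_prof=t_prof)
    train_env = env_bldr.get_new_wrapper(is_evaluating=False)                 # as in Example #12
    eval_env = env_bldr.get_new_wrapper(is_evaluating=True, stack_size=None)  # as in Example #8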
Example #13
    def __init__(self, t_prof, worker_id, chief_handle):
        super().__init__(t_prof=t_prof)

        self._env_bldr = rl_util.get_env_builder(t_prof=t_prof)
        self._id = worker_id
        self._chief_handle = chief_handle

        self._ddqn_args = t_prof.module_args["ddqn"]
        self._avg_args = t_prof.module_args["avg"]

        if t_prof.nn_type == "recurrent":
            from PokerRL.rl.buffers.CircularBufferRNN import CircularBufferRNN
            from NFSP.workers.la.action_buffer.ActionBufferRNN import ActionBufferRNN

            BR_BUF_CLS = CircularBufferRNN
            AVG_BUF_CLS = ActionBufferRNN

        elif t_prof.nn_type == "feedforward":
            from PokerRL.rl.buffers.CircularBufferFLAT import CircularBufferFLAT
            from NFSP.workers.la.action_buffer.ActionBufferFLAT import ActionBufferFLAT

            BR_BUF_CLS = CircularBufferFLAT
            AVG_BUF_CLS = ActionBufferFLAT
        else:
            raise ValueError(t_prof.nn_type)

        self._avg_buf2 = [
            AVG_BUF_CLS(env_bldr=self._env_bldr,
                        max_size=self._avg_args.res_buf_size,
                        min_prob=self._avg_args.min_prob_res_buf)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._br_buf2 = [
            BR_BUF_CLS(env_bldr=self._env_bldr,
                       max_size=self._ddqn_args.cir_buf_size)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._br_learner2 = [
            DDQN(owner=p, ddqn_args=self._ddqn_args, env_bldr=self._env_bldr)
            for p in range(self._env_bldr.N_SEATS)
        ]
        self._avg_learner2 = [
            AvgWrapper(owner=p,
                       env_bldr=self._env_bldr,
                       avg_training_args=self._avg_args)
            for p in range(self._env_bldr.N_SEATS)
        ]

        if self._t_prof.sampling == "adam":
            self._sampler = AdamSampler(
                t_prof=t_prof,
                env_bldr=self._env_bldr,
                br_buf2=self._br_buf2,
                avg_buf2=self._avg_buf2,
                br_learner2=self._br_learner2,
                avg_learner2=self._avg_learner2,
                constant_eps=self._t_prof.constant_eps_expl)

        elif self._t_prof.sampling == "clean":
            self._sampler = CleanSampler(
                t_prof=t_prof,
                env_bldr=self._env_bldr,
                br_buf2=self._br_buf2,
                avg_buf2=self._avg_buf2,
                br_learner2=self._br_learner2,
                avg_learner2=self._avg_learner2,
                constant_eps=self._t_prof.constant_eps_expl)
        else:
            self._sampler = VanillaSampler(t_prof=t_prof,
                                           env_bldr=self._env_bldr,
                                           br_buf2=self._br_buf2,
                                           avg_buf2=self._avg_buf2,
                                           br_learner2=self._br_learner2,
                                           avg_learner2=self._avg_learner2)