Example #1
def get_config():
    agent_names = ['agent%d' % i for i in range(1, 4)]
    model = Model(agent_names, STATE_SHAPE, METHOD, NUM_ACTIONS, GAMMA)
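    # one experience-replay collector per agent; each builds its own multi-agent Env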
    exps = [
        ExpReplay(
            # model=model,
            agent_name=name,
            player=Env(agent_names),
            state_shape=STATE_SHAPE,
            num_actions=[MAX_NUM_COMBS, MAX_NUM_GROUPS],
            batch_size=BATCH_SIZE,
            memory_size=MEMORY_SIZE,
            init_memory_size=INIT_MEMORY_SIZE,
            init_exploration=1.,
            update_frequency=UPDATE_FREQ) for name in agent_names
    ]

    df = MyDataFLow(exps)

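    # baseline evaluators, one per agent: each runs EVAL_EPISODE episodes against a freshly constructed CEnv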
    bl_evaluators = [
        BLEvaluator(EVAL_EPISODE, agent_names[0], 2, lambda: CEnv()),
        BLEvaluator(EVAL_EPISODE, agent_names[1], 3, lambda: CEnv()),
        BLEvaluator(EVAL_EPISODE, agent_names[2], 1, lambda: CEnv())
    ]

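    # assemble the tensorpack training config: queue-fed dataflow, the shared model, and the callbacks below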
    return AutoResumeTrainConfig(
        # always_resume=False,
        data=QueueInput(df),
        model=model,
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(model.update_target_param, verbose=True),
                every_k_steps=STEPS_PER_EPOCH // 10),  # update target network every 10k steps
            *exps,
            # ScheduledHyperParamSetter('learning_rate',
            #                           [(60, 5e-5), (100, 2e-5)]),
            *[
                ScheduledHyperParamSetter(
                    ObjAttrParam(exp, 'exploration'),
                    [(0, 1), (30, 0.5), (100, 0.3),
                     (320, 0.1)],  # 1->0.1 in the first million steps
                    interp='linear') for exp in exps
            ],
            *bl_evaluators,
            Evaluator(EVAL_EPISODE, agent_names, lambda: Env(agent_names)),
            HumanHyperParamSetter('learning_rate'),
        ],
        # session_init=ChainInit([SaverRestore('../Hierarchical_Q/train_log/DQN-9-3-LASTCARDS/model-240000', 'agent1'),
        #                        SaverRestore('./train_log/DQN-60-MA/model-355000')]),
        # starting_epoch=0,
        # session_init=SaverRestore('train_log/DQN-54-AUG-STATE/model-75000'),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
Example #2
    def _build_player(self):
        return CEnv()
Example #3
def get_player():
    return CEnv()
Example #4
        v = self._player_scores
        try:
            mean, max = v.average, v.max
            self.trainer.monitors.put_scalar('expreplay/mean_score', mean)
            self.trainer.monitors.put_scalar('expreplay/max_score', max)
        except Exception:
            logger.exception("Cannot log training scores.")
        v.reset()


if __name__ == '__main__':

    import numpy as np  # needed for the dummy predictor below

    # stand-in predictor returning random values in place of real Q-value predictions
    def predictor(x):
        return [np.random.random([1, 100])]

    player = CEnv()
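    # minimal ExpReplay wired to the C environment; exploration is 0, so actions come from the predictor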
    E = ExpReplay(predictor_io_names=(['state', 'comb_mask'], ['Qvalue']),
                  player=CEnv(),
                  state_shape=(100, 21, 256),
                  num_actions=[100, 21],
                  batch_size=16,
                  memory_size=1e4,
                  init_memory_size=1e4,
                  init_exploration=0.,
                  update_frequency=4)
    E.predictor = predictor
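    # populate the initial replay memory using the stand-in predictor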
    E._init_memory()
    # for k in E.get_data():
    #     pass

Example #5
            print(action)
            winner, done = env.step(action)
            if done:
                for agent_name in agent_names:
                    if agent_name == winner:
                        cnt[agent_name] += 1
                        print(agent_name, ' wins')
                    elif env.get_all_agent_names().index(winner) + env.get_all_agent_names().index(agent_name) == 3:
                        # seats whose indices sum to 3 are 1 and 2, so count this as a shared ("all") win for the agent
                        cnt[agent_name] += 1
                        print(agent_name, winner, ' all wins')
    print(cnt)

    # C env usage
    # TODO
    env = CEnv()
    # for _ in range(1):
    #     env.reset()
    #     env.prepare()
    #     done = False
    #     while not done:
    #         handcards = to_char(env.get_curr_handcards())
    #
    #         chandcards = [CCard(to_value(c) - 3) for c in handcards]
    #         unseen_cards = env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]].copy() \
    #                         + env.player_cards[agent_names[(env.get_current_idx() + 2) % len(env.agent_names)]].copy()
    #         cunseen_cards = [CCard(to_value(c) - 3) for c in unseen_cards]
    #         next_handcards_cnt = len(env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]])
    #
    #         last_cg = char2ccardgroup(env.get_last_outcards())
    #         caction = mcsearch(chandcards, cunseen_cards, next_handcards_cnt, last_cg, env.agent_names.index(env.curr_player), env.agent_names.index(env.controller))