def get_config():
    agent_names = ['agent%d' % i for i in range(1, 4)]
    model = Model(agent_names, STATE_SHAPE, METHOD, NUM_ACTIONS, GAMMA)
    exps = [
        ExpReplay(
            # model=model,
            agent_name=name,
            player=Env(agent_names),
            state_shape=STATE_SHAPE,
            num_actions=[MAX_NUM_COMBS, MAX_NUM_GROUPS],
            batch_size=BATCH_SIZE,
            memory_size=MEMORY_SIZE,
            init_memory_size=INIT_MEMORY_SIZE,
            init_exploration=1.,
            update_frequency=UPDATE_FREQ)
        for name in agent_names
    ]
    df = MyDataFLow(exps)
    bl_evaluators = [
        BLEvaluator(EVAL_EPISODE, agent_names[0], 2, lambda: CEnv()),
        BLEvaluator(EVAL_EPISODE, agent_names[1], 3, lambda: CEnv()),
        BLEvaluator(EVAL_EPISODE, agent_names[2], 1, lambda: CEnv())
    ]
    return AutoResumeTrainConfig(
        # always_resume=False,
        data=QueueInput(df),
        model=model,
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(model.update_target_param, verbose=True),
                every_k_steps=STEPS_PER_EPOCH // 10),  # update target network every 10k steps
            *exps,
            # ScheduledHyperParamSetter('learning_rate',
            #                           [(60, 5e-5), (100, 2e-5)]),
            *[
                ScheduledHyperParamSetter(
                    ObjAttrParam(exp, 'exploration'),
                    [(0, 1), (30, 0.5), (100, 0.3), (320, 0.1)],  # 1 -> 0.1 in the first million steps
                    interp='linear')
                for exp in exps
            ],
            *bl_evaluators,
            Evaluator(EVAL_EPISODE, agent_names, lambda: Env(agent_names)),
            HumanHyperParamSetter('learning_rate'),
        ],
        # session_init=ChainInit([SaverRestore('../Hierarchical_Q/train_log/DQN-9-3-LASTCARDS/model-240000', 'agent1'),
        #                         SaverRestore('./train_log/DQN-60-MA/model-355000')]),
        # starting_epoch=0,
        # session_init=SaverRestore('train_log/DQN-54-AUG-STATE/model-75000'),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
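
# Hedged usage sketch (assumption, not taken from this file): get_config() returns a
# tensorpack AutoResumeTrainConfig, which is normally handed to
# launch_train_with_config together with a trainer. SimpleTrainer below is only a
# placeholder; a multi-GPU trainer may be what the repo actually uses.
# if __name__ == '__main__':
#     from tensorpack import launch_train_with_config, SimpleTrainer
#     launch_train_with_config(get_config(), SimpleTrainer())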
def _build_player(self):
    return CEnv()
def get_player():
    return CEnv()
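
# Hedged note (assumption): zero-argument factories like get_player() play the same role
# as the `lambda: CEnv()` callables passed to BLEvaluator/Evaluator in get_config(), so
# each evaluation thread can build its own fresh environment, e.g.:
# bl_evaluator = BLEvaluator(EVAL_EPISODE, agent_names[0], 2, get_player)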
v = self._player_scores
try:
    mean, max = v.average, v.max
    self.trainer.monitors.put_scalar('expreplay/mean_score', mean)
    self.trainer.monitors.put_scalar('expreplay/max_score', max)
except Exception:
    logger.exception("Cannot log training scores.")
v.reset()


if __name__ == '__main__':
    def predictor(x):
        return [np.random.random([1, 100])]

    player = CEnv()
    E = ExpReplay(
        predictor_io_names=(['state', 'comb_mask'], ['Qvalue']),
        player=CEnv(),
        state_shape=(100, 21, 256),
        num_actions=[100, 21],
        batch_size=16,
        memory_size=1e4,
        init_memory_size=1e4,
        init_exploration=0.,
        update_frequency=4)
    E.predictor = predictor
    E._init_memory()
    # for k in E.get_data():
    #     pass
    # for k in E.get_data():
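    # Hedged smoke test (assumption): ExpReplay behaves as a tensorpack DataFlow, so after
    # _init_memory() it should yield training batches; the exact tuple layout is not
    # verified against this repo.
    # for i, batch in enumerate(E.get_data()):
    #     print('batch %d: %s' % (i, [getattr(x, 'shape', x) for x in batch]))
    #     if i == 3:
    #         break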
print(action)
winner, done = env.step(action)
if done:
    for agent_name in agent_names:
        if agent_name == winner:
            cnt[agent_name] += 1
            print(agent_name, ' wins')
        else:
            # indices summing to 3 only happens for the agents at positions 1 and 2,
            # so those two are treated as a team: the partner of the winner is also
            # credited with a win
            if env.get_all_agent_names().index(winner) + env.get_all_agent_names().index(agent_name) == 3:
                cnt[agent_name] += 1
                print(agent_name, winner, ' all wins')
print(cnt)

# C env usage
# TODO
env = CEnv()
# for _ in range(1):
#     env.reset()
#     env.prepare()
#     done = False
#     while not done:
#         handcards = to_char(env.get_curr_handcards())
#         # chandcards = [CCard(to_value(c) - 3) for c in handcards]
#         unseen_cards = env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]].copy() \
#             + env.player_cards[agent_names[(env.get_current_idx() + 2) % len(env.agent_names)]].copy()
#         cunseen_cards = [CCard(to_value(c) - 3) for c in unseen_cards]
#         next_handcards_cnt = len(env.player_cards[agent_names[(env.get_current_idx() + 1) % len(env.agent_names)]])
#
#         last_cg = char2ccardgroup(env.get_last_outcards())
#         caction = mcsearch(chandcards, cunseen_cards, next_handcards_cnt, last_cg,
#                            env.agent_names.index(env.curr_player), env.agent_names.index(env.controller))
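
# Hedged follow-up sketch (assumption): summarize the `cnt` counters accumulated in the
# evaluation loop above the C env section. Because teammate wins are also counted, these
# are per-agent win rates over the episodes played, not a partition of the games.
# n_episodes = 100  # hypothetical; use the actual number of rollouts run above
# for name, wins in cnt.items():
#     print('%s: %.2f%% wins' % (name, 100. * wins / n_episodes))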