cur_state = env.reset() step_counter = 0 while True: step_counter += 1 if key: env.display() action = agent.choose_action(cur_state) next_state, reward = env.move(action) agent.learn( cur_state=cur_state, action=action, reward=reward, next_state=next_state ) cur_state = next_state if reward != 0: break if reward > 0: successful_step_counter_arr.append(step_counter) elif reward < 0: failed_step_counter_arr.append(step_counter) if key: print(
# Train a Q-learning agent on the tic-tac-toe environment, then collect the
# agent's per-state tables into pandas DataFrames for inspection.
import numpy as np
import pandas as pd

from env import TicTacToeEnv
from agent import QLearningAgent

# Number of training games to play and how often to print progress.
NUM_GAMES = 1000000
PROGRESS_INTERVAL = 10000

env = TicTacToeEnv()
agent = QLearningAgent(env)

for game_nr in range(NUM_GAMES):
    if game_nr % PROGRESS_INTERVAL == 0:
        print(game_nr)

    done = False
    # Work on a copy of the state returned by the environment.
    # NOTE(review): presumably env mutates its state object in place, which is
    # why copies are taken here and after each step -- confirm against env.
    s = env.reset().copy()

    while not done:
        a = agent.take_action(s)
        r, s_, done, _ = env.step(a)
        agent.learn(s, a, r, s_, done)
        s = s_.copy()

# agent._V and agent._N are dict-like tables keyed by state; each becomes a
# single-column frame indexed by those keys.
# NOTE(review): presumably _V holds value estimates and _N visit counts --
# confirm against QLearningAgent.
V = pd.DataFrame.from_dict(
    agent._V, orient='index', dtype=np.float32, columns=['V'])
N = pd.DataFrame.from_dict(
    agent._N, orient='index', dtype=np.uint32, columns=['N'])

# Left-join the N column onto V by index (every row of V is kept).
df = V.merge(N, how='left', left_index=True, right_index=True)

# Expand each index entry (a state key) into its own row of columns, keeping
# the original index so V and N can be joined back on it.
states = pd.DataFrame(df.index.values.tolist(), index=df.index)

# Final table: expanded state columns followed by V and N, with the state-key
# index replaced by a plain positional index.
res = (
    states
    .merge(V, how='left', left_index=True, right_index=True)
    .merge(N, how='left', left_index=True, right_index=True)
    .reset_index(drop=True)
)