import numpy as np


def initialize_buffer_with_single_test_tuple(size):
    """Build a replay buffer holding a single terminal (state, action, reward,
    next state, terminal) tuple, found by scanning all states of grid_100 for
    the given grid size."""
    grid = Grid(filename="levels/" + str(size) + "x" + str(size) + "/grid_100.txt")
    result = set()
    states = grid.generate_all_states()
    find = False
    print("Start state: ")
    print(grid.start_state.spaces)
    for state in states:
        q_state = QState(state)
        for action in range((size - 1) * 4):
            # Decode the flat action index into a (color, direction) tuple.
            color = action // 4 + 1
            direction = action % 4
            action_tu = (color, direction)
            new_state, reward, terminal = q_state.step(action_tu)
            if terminal:
                sars = (q_state, action, reward, new_state, terminal)
                print("Old state: ")
                print(q_state.state.spaces)
                print("New state")
                print(new_state.state.spaces)
                print("Action: ", action_tu)
                print("Action index: ", action)
                print("Old state winning: ", q_state.is_winning())
                print("New state winning: ", new_state.is_winning())
                result.add(sars)
                find = True
                break
        if find:
            break
    return list(result)
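# The flat action index used throughout this file packs a color and a direction:
# actions 0-3 are the four directions for color 1, actions 4-7 are for color 2,
# and so on, giving 4 * (size - 1) actions in total. A small illustrative helper
# (not used by the functions here, which inline the same arithmetic):
def decode_action(action):
    """Map a flat action index to the (color, direction) tuple expected by QState.step."""
    return action // 4 + 1, action % 4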
def initialize_buffer_with_all_tuples(size, mlp):
    """Build a replay buffer holding every (state, action, reward, next state,
    terminal) tuple for grid_1 of the given grid size. The mlp argument is not
    used in this function."""
    grid = Grid(filename="levels/" + str(size) + "x" + str(size) + "/grid_1.txt")
    result = set()
    states = grid.generate_all_states()
    for state in states:
        q_state = QState(state)
        # Don't store the winning state in the replay buffer.
        if q_state.is_winning():
            print("Don't include me!")
            continue
        for action in range((size - 1) * 4):
            # Decode the flat action index into a (color, direction) tuple.
            color = action // 4 + 1
            direction = action % 4
            action_tu = (color, direction)
            new_state, reward, terminal = q_state.step(action_tu)
            sars = (q_state, action, reward, new_state, terminal)
            result.add(sars)
    print("Initial replay buffer size: ", len(result))
    return list(result)
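# A minimal sketch of drawing a training minibatch from the replay buffer built
# above. sample_minibatch is a hypothetical helper, not part of the existing
# code; it relies only on the list of (q_state, action, reward, new_state,
# terminal) tuples returned by the initializers.
import random


def sample_minibatch(replay_buffer, batch_size=32):
    """Uniformly sample up to batch_size transitions from the replay buffer."""
    return random.sample(replay_buffer, min(batch_size, len(replay_buffer)))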
def train(file, size, Q=None, gamma=0.9, num_epochs=3):
    """Tabular Q-learning over a single level: each epoch sweeps every
    (state, action) pair of the grid and applies the Q-learning update."""
    # Use None as the default to avoid a shared mutable default argument;
    # a dict() default would persist (and accumulate entries) across calls.
    if Q is None:
        Q = dict()
    print("Train ", file)
    print("Epochs: ", num_epochs)
    grid = Grid(filename=file)
    epsilon = 1.0  # Currently unused: the loops below sweep every (state, action) pair.
    print("Generate all states!")
    all_states = grid.generate_all_states()
    state_size = len(all_states)
    print("All states: ", state_size)
    lr = 0.5
    winning_states = 0
    action_size = 4 * (size - 1)  # size - 1 colors (4 in a 5x5 grid), 4 directions each
    iteration = 0
    for epoch in range(num_epochs):
        print("Epoch: ", epoch)
        for state in all_states:
            for action in range(action_size):
                if iteration % 1000 == 0:
                    print("iteration ", iteration)
                if state not in Q:
                    Q[state] = np.zeros(action_size)
                # Decode the flat action index into a (color, direction) tuple.
                color = action // 4 + 1
                direction = action % 4
                action_tu = (color, direction)

                def get_next_tuple():
                    if state.is_viable_action(action_tu):
                        new_state = state.next_state(action_tu)
                        if new_state.is_winning():
                            reward = 1000000000
                            return new_state, reward
                        # Reward completed flows, penalize remaining empty cells.
                        flows = new_state.completed_flow_count()
                        zeroes = new_state.num_zeroes_remaining()
                        reward = 1000 * flows - 5 * zeroes
                        return new_state, reward
                    # Non-viable actions are heavily penalized and leave the state unchanged.
                    return state, -1000000

                new_state, reward = get_next_tuple()
                if new_state not in Q:
                    Q[new_state] = np.zeros(action_size)
                # Standard Q-learning update.
                Q[state][action] += lr * (reward + gamma * np.max(Q[new_state]) - Q[state][action])
                if new_state.is_winning():
                    print("Winning State!")
                    winning_states += 1
                iteration += 1
    print("Number of winning states: ", winning_states)
    return Q
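# A sketch of how the learned table might be used once train() has run: pick
# the action with the highest Q-value for a state. greedy_action is a
# hypothetical helper; it assumes Q maps each state to a NumPy array of length
# 4 * (size - 1), as built by train() above.
def greedy_action(Q, state):
    """Return the (color, direction) tuple with the highest Q-value for state."""
    best = int(np.argmax(Q[state]))
    return best // 4 + 1, best % 4


# Example usage (assumes a levels/5x5/grid_1.txt file as in the functions above):
# grid = Grid(filename="levels/5x5/grid_1.txt")
# Q = train("levels/5x5/grid_1.txt", 5)
# some_state = next(iter(grid.generate_all_states()))
# print(greedy_action(Q, some_state))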