def do_epsilon_greedy_step(paths, epsilon, predict_model):
    """Extend every still-flying path by one epsilon-greedy move.

    Each path is a list of ``(move, state)`` tuples; only its last entry is
    read. For every path whose latest state has status
    ``helicopter4x4.Status.flying``, the 16 board cells are tried from the
    highest-scored to the lowest-scored on a deep copy of that state, and
    the first move for which ``receive_Move`` returns ``None`` is kept.
    The resulting ``(move, state)`` tuple is appended to the path in place.

    Args:
        paths: Sequence of paths; the flying ones are mutated in place.
        epsilon: Probability with which a path's scores are replaced by
            random ones. Exactly ``1.0`` skips the network entirely.
        predict_model: Forwarded unchanged to ``get_predictions``.

    Returns:
        ``True`` if no path was extended (none of the latest states was
        flying), ``False`` otherwise.
    """
    path_count = len(paths)

    # With epsilon == 1.0 every score row ends up random anyway, so the
    # network is not queried at all.
    if epsilon == 1.0:
        scores = np.random.uniform(size=(path_count, 16))
    else:
        scores = get_predictions(paths, predict_model).reshape((-1, 16))

    # Independently per path, swap the scores for random ones with
    # probability epsilon.
    explore = np.random.uniform(size=path_count) < epsilon
    scores[explore] = np.random.uniform(size=(explore.sum(), 16))

    # Row n lists the flat cell indices of path n, best score first.
    best_first = scores.argsort()[:, ::-1]

    nothing_extended = True
    for ranking, path in zip(best_first, paths):
        _last_move, current = path[-1]
        if current.status != helicopter4x4.Status.flying:
            continue

        # Try candidates on a copy so the state stored in the path is
        # never touched.
        trial = copy.deepcopy(current)
        for flat_index in ranking:
            row_col = np.unravel_index(flat_index, (4, 4))
            move = helicopter4x4.Position(*row_col)
            if trial.receive_Move(move) is None:
                break
        # NOTE(review): if no candidate returns None, the last move tried
        # is still appended together with the copy after all 16 attempts.
        path.append((move, trial))
        nothing_extended = False

    return nothing_extended
def generate_random_initial(size):
    """Create one initial game state per random map.

    Args:
        size: Passed straight to ``generate_random_maps``.

    Returns:
        A list of ``helicopter4x4.State`` objects, one per generated map,
        each initialised through ``receive_SetState`` with position (0, 0).
    """

    def _state_for(nmap):
        # Build a blank state and load the map into it.
        fresh = helicopter4x4.State()
        origin = helicopter4x4.Position(0, 0)
        # NOTE(review): the meaning of the trailing 2 is not visible here.
        fresh.receive_SetState(origin, nmap, 2)
        return fresh

    return [_state_for(nmap) for nmap in generate_random_maps(size)]
def generate_random_maps(size):
    """Generate ``size`` random maps over the 4x4 board.

    One random 16-bit pattern is drawn per map and every set bit marks one
    board cell. Each map is a ``defaultdict`` keyed by
    ``helicopter4x4.Position``: marked cells map to ``True``; looking up any
    other cell yields (and inserts) an empty list, which is falsy.

    Args:
        size: Number of maps to generate (an integer).

    Returns:
        A list of ``size`` defaultdicts.
    """
    # Truncating a uniform float in [0, 2**16) gives a uniform 16-bit
    # integer. The explicit little-endian dtype guarantees that the first
    # two bytes of each 4-byte word are the ones carrying the data; with a
    # native uint32 on a big-endian host they would always be zero.
    patterns = np.random.uniform(0, 2**16, size=size).astype("<u4")

    # 32 bits per word; only the first 16 (the two data bytes) are kept.
    bits = np.unpackbits(patterns.view(np.uint8)).reshape((-1, 32))[:, :16]

    maps = [defaultdict(list) for _ in range(size)]

    # Each argwhere row is (map index, flat cell index) of one set bit.
    for map_index, cell_index in np.argwhere(bits.astype(bool)):
        cell = helicopter4x4.Position(*np.unravel_index(cell_index, (4, 4)))
        maps[map_index][cell] = True
    return maps
def prepare_target(self, rewards, nstates, dones, symbolic, discount):
    """Build the target array for a batch of transitions.

    Entries flagged as done receive their reward. Every other entry
    receives ``discount`` times the value that ``self.predict_model``
    assigns to the best-ranked move accepted by the symbolic state, where
    the ranking comes from ``self.doubled_model``.

    Args:
        rewards: Array indexed by the done mask; supplies done targets.
        nstates: Input handed to both models' ``predict``.
        dones: Array; entries ``> 0.0`` are treated as finished.
        symbolic: Game states used to check which moves are accepted.
            NOTE(review): these are zipped against the non-done rows only,
            so this presumably has to contain just the non-done states in
            matching order - confirm against the caller.
        discount: Factor applied to the value of non-done entries.

    Returns:
        The array returned by ``self.predict_model.predict(nstates)``,
        overwritten with the targets.
    """
    finished = dones > 0.0
    ongoing = ~finished

    targets = self.predict_model.predict(nstates)
    # Boolean indexing copies, so overwriting `targets` below does not
    # disturb these values.
    value_rows = targets[ongoing].reshape((-1, 16))

    ranking_scores = self.doubled_model.predict(nstates)
    # Flat cell indices per row, highest score first.
    ranked = ranking_scores[ongoing].reshape((-1, 16)).argsort(axis=1)[:, ::-1]

    best_values = np.zeros(value_rows.shape[0])
    for row, (state, values, order) in enumerate(
            zip(symbolic, value_rows, ranked)):
        # Probe moves on a copy; stop at the first one receive_Move
        # accepts (signalled by a None return).
        probe = copy.deepcopy(state)
        for flat_index in order:
            cell = helicopter4x4.Position(*np.unravel_index(flat_index, (4, 4)))
            if probe.receive_Move(cell) is None:
                break
        # If nothing was accepted this is the value of the last move tried.
        best_values[row] = values[flat_index]

    # The two added axes let each scalar broadcast over the remaining
    # dimensions of a target entry.
    targets[finished] = rewards[finished, None, None]
    targets[ongoing] = best_values[:, None, None] * discount
    return targets
def random_move():
    """Return a uniformly random cell of the 4x4 board.

    Returns:
        A ``helicopter4x4.Position`` built from the (row, column) pair of a
        flat index drawn uniformly from 0..15.
    """
    # np.random.uniform(16) binds 16 to `low` and leaves `high` at 1.0, so
    # truncating its result never produced index 0 and could produce the
    # out-of-range index 16. randint(16) draws from 0..15 inclusive.
    flat_index = int(np.random.randint(16))
    row_col = np.unravel_index(flat_index, (4, 4))
    return helicopter4x4.Position(*row_col)