Example #1
0
def do_epsilon_greedy_step(paths, epsilon, predict_model):
    """Advance every still-flying path by one epsilon-greedy action.

    With probability ``epsilon`` a path's action scores are replaced by
    uniform noise; otherwise the network's predictions are used. For each
    flying path the highest-scoring *accepted* move is applied to a deep
    copy of its latest state, and ``(move, new_state)`` is appended.

    Returns True when no path was still flying (nothing advanced).
    """
    n_paths = len(paths)

    # Fully random policy: skip querying the network altogether.
    if epsilon == 1.0:
        scores = np.random.uniform(size=16 * n_paths).reshape((-1, 16))
    else:
        scores = get_predictions(paths, predict_model).reshape((-1, 16))

        # Replace each row with noise independently, with probability epsilon.
        randomize = np.random.uniform(size=n_paths) < epsilon
        noise = np.random.uniform(size=16 * randomize.sum())
        scores[randomize] = noise.reshape((-1, 16))

    # Per-path action ranking, ascending by score.
    order = scores.argsort()

    alldone = True
    for idx, path in enumerate(paths):
        _, state = path[-1]
        if state.status != helicopter4x4.Status.flying:
            continue

        # Try candidates best-first; stop at the first move the state accepts
        # (receive_Move returning None signals acceptance).
        state = copy.deepcopy(state)
        for flat in order[idx, ::-1]:
            move = helicopter4x4.Position(*np.unravel_index(flat, (4, 4)))
            if state.receive_Move(move) is None:
                break

        path.append((move, state))
        alldone = False
    return alldone
Example #2
0
def generate_random_initial(size):
    """Create ``size`` fresh game states, one per random map, each
    initialised at position (0, 0) with parameter 2."""
    result = []
    for world in generate_random_maps(size):
        st = helicopter4x4.State()
        # A fresh Position per state, in case the state keeps a reference.
        st.receive_SetState(helicopter4x4.Position(0, 0), world, 2)
        result.append(st)
    return result
Example #3
0
def generate_random_maps(size):
    """Generate ``size`` random 4x4 maps.

    Each map is a defaultdict mapping Position -> True for occupied cells;
    absent cells fall back to an empty list (falsy) via the default factory.
    """
    # One value below 2**16 per map; viewed as bytes (little-endian), the
    # first 16 unpacked bits are exactly the low 16 bits of the value.
    raw = np.random.uniform(0, 2**16, size=size).astype(np.uint32)
    bits = np.unpackbits(raw.view(np.uint8)).reshape((-1, 32))[:, :16]

    # Rows of (map_index, cell_index) for every set bit.
    occupied = np.argwhere(bits.astype(bool))

    maps = [defaultdict(list) for _ in range(size)]
    for map_ix, cell in occupied:
        pos = helicopter4x4.Position(*np.unravel_index(cell, (4, 4)))
        maps[map_ix][pos] = True
    return maps
Example #4
0
    def prepare_target(self, rewards, nstates, dones, symbolic, discount):
        """Build per-sample TD targets with double-Q-style action selection:
        actions are ranked by ``doubled_model`` but evaluated with the
        Q-values from ``predict_model``.

        Terminal samples (dones > 0) get their raw reward broadcast over the
        trailing dimensions; non-terminal samples get ``discount`` times the
        predict_model Q-value of the first ranked action their symbolic
        state accepts.

        NOTE(review): the non-terminal target is ``discount * maxQ`` with no
        reward term added — confirm rewards are always zero for non-terminal
        transitions in this environment, otherwise they are dropped here.
        """
        done_idx = dones > 0.0
        # Q-values used to EVALUATE the chosen action; also reused as the
        # output buffer that gets overwritten below.
        targetQ = self.predict_model.predict(nstates)
        predictQ = targetQ[~done_idx].reshape((-1, 16))

        # Q-values used to SELECT the action; argsort + reverse gives each
        # row's action indices ranked best-first.
        idxQ = self.doubled_model.predict(nstates)
        idxQ = idxQ[~done_idx].reshape((-1, 16)).argsort(axis=1)[:, ::-1]
        maxQ = np.zeros(predictQ.shape[0])

        # For each non-terminal sample, walk the ranked actions and keep the
        # first one the (copied) symbolic state accepts — receive_Move
        # returning None signals acceptance.
        for n, (s, qs, ixqs) in enumerate(zip(symbolic, predictQ, idxQ)):
            s = copy.deepcopy(s)
            for ixq in ixqs:
                action = np.unravel_index(ixq, (4, 4))
                action = helicopter4x4.Position(*action)
                if s.receive_Move(action) is None:
                    break
            # ixq is the action chosen above; if no action was accepted it is
            # the lowest-ranked one — presumably some move is always legal.
            maxQ[n] = qs[ixq]

        targetQ[done_idx] = rewards[done_idx, None, None]
        targetQ[~done_idx] = maxQ[:, None, None] * discount
        return targetQ
Example #5
0
def random_move():
    """Return a uniformly random Position on the 4x4 grid.

    Bug fix: the original ``np.random.uniform(16)`` was actually
    ``uniform(low=16, high=1.0)``, sampling from (1, 16] — index 0 was
    unreachable and a draw of exactly 16 makes ``np.unravel_index`` raise.
    ``randint(16)`` draws a proper uniform integer in [0, 16).
    """
    ix = np.random.randint(16)
    move = np.unravel_index(ix, (4, 4))
    return helicopter4x4.Position(*move)