def main():

    env = envstandalone.MultiGhostEvade()
    #    env = envstandalone.GhostEvade()
    #    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    #    target_network_update_freq=100
    #    target_network_update_freq=10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    obsShape = (8,8,2)
    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    #    deicticShape = (4,4,2)
    #    deicticShape = (4,4,4)
    deicticShape = (8, 8, 2)
    #    num_deictic_patches = 36
    #    num_deictic_patches = 25
    num_deictic_patches = 1

    #    num_actions = 4
    #    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
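    # With exploration_fraction = 0.4 and max_timesteps = 40000, epsilon decays
    # linearly from 1.0 to 0.02 over the first 16000 steps and stays at 0.02 afterward.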

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #    model = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1)],
        convs=[(32, 3, 1)],
        #        convs=[(32,4,1)],
        #        convs=[(16,4,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq_DQN(make_obs_ph=make_obsDeic_ph,
                          q_func=q_func,
                          num_actions=num_actions)

    targetTrain = build_targetTrain_DQN(
        make_obs_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    get_2channelobs = build_get_2channelobs(make_obs_ph=make_obs_ph)
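    # NOTE (assumption): get_2channelobs is taken to convert the raw (8, 8, 1)
    # observation into the two-channel (8, 8, 2) form matching deicticShape, so the
    # full image is treated as a single deictic patch (num_deictic_patches = 1).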

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obs2channel = get_2channelobs([obs])

        # CNN version
        #        qCurr = getq(np.array([obs]))
        qCurr = getq(np.array(obs2channel))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [batch_size]))

            obses_t_deic = get_2channelobs(obses_t)
            obses_tp1_deic = get_2channelobs(obses_tp1)

            #            # Put observations in deictic form
            #            obses_t_deic = getDeic(obses_t)
            #            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            #            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            #
            #            # Reshape everything to (1152,) form
            #            donesTiled = np.repeat(dones,num_deictic_patches)
            #            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            #            actionsTiled = np.repeat(actions,num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)
            #            qNext = getq(obses_tp1)
            #            qCurr = getq(obses_t)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext, 1)  # standard
            #            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
            #            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            #            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax
            targets = rewards + (1 - dones) * gamma * qNextmax
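            # i.e. the one-step TD target y = r + (1 - done) * gamma * max_a' Q(s', a'), with gamma = 0.98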

            #            # Take min over targets in same group
            #            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            #            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            #            for i in range(np.shape(uniqueCounts)[0]):
            #                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            #            qCurrTargets = np.copy(qCurr)
            #            qCurrTargets[:,np.int32(actions)] = targets
            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]
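            # Only the column of the action actually taken is replaced by its TD target;
            # all other action columns keep their current Q estimates, so they contribute
            # zero error to the regression below.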

#            # Copy into cascade with pruning.
#            qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
#            for i in range(num_cascade-1):
#                mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
#                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
#                    mask*targets + \
#                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out = targetTrain(obses_t_deic, qCurrTargets)

#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )

#        # Update target network periodically.
#        if t > learning_starts and t % target_network_update_freq == 0:
#            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

# ******** Example 2 ********

def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # each row of vectorKey must be a boolean vector fewer than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the integer type used below (uint64) must have at least as many
            # bits as are needed to encode obsBits; if it is too small, distinct
            # observations can hash to the same key (collisions).
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys
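
    # A quick hypothetical illustration of the key encoding: for a single 4-bit row,
    # np.packbits([[1, 0, 1, 0]], axis=1) yields the byte 0b10100000 = 160, so
    # getTabularKeys returns array([160]); longer rows contribute further bytes
    # weighted by 256**i.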

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        #        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([
            q_func_dict[x] if x in q_func_dict
            else np.zeros([num_cascade, num_states]) for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.3
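        # Exponential moving average: each visit moves the stored value a fraction
        # alpha (30%) of the way toward the new target, i.e. Q <- (1 - alpha) * Q + alpha * target.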
        for i in range(len(keys)):
            if keys[i] in q_func_dict:
                q_func_dict[keys[i]] = (
                    1 - alpha) * q_func_dict[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func_dict[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    #    max_timesteps=80000
    #    max_timesteps=160000
    learning_starts = 1000
    #    buffer_size=50000
    buffer_size = 10000
    #    buffer_size=1000
    #    buffer_size=100
    #    buffer_size=2
    #    exploration_fraction=0.4
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    #    gamma=.98
    gamma = .96
    target_network_update_freq = 1
    #    batch_size=32
    batch_size = 64
    #    batch_size=128
    #    batch_size=256
    #    batch_size=8
    #    train_freq=1
    train_freq = 2
    #    train_freq=4
    #    train_freq=8
    #    train_freq=16
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    # IMPORTANT: the first two elements of deicticShape and deicticActionShape must be odd
    deicticShape = (3, 3, 2)
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        convs=[(32, 3, 1)],
        #        convs=[(16,3,1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.
        #        grad_norm_clipping=0.1
    )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: index of the object in hand, an integer in range(0, num_states)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions

        moveDescriptors = getMoveActionDescriptors([obs[0]])
        #        actionsPickDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        #        actionsPlaceDescriptors = np.concatenate([np.ones(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
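        # Row ordering matches the action encoding: indices 0 .. num_patches-1 are pick
        # actions and num_patches .. 2*num_patches-1 are place actions, so the chosen
        # action index directly selects one descriptor row.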

        #        # TABULAR version
        #        actionDescriptors = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
        #        qCurr = getTabular(actionDescriptors)

        # DQN version
        qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE
        #        action = np.argmax(qCurrNoise[:,0,stateDeictic]) # NO CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))
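        # NOTE (assumption): the ReplayBuffer used here is taken to be a variant that
        # stores/returns the next observation's image and in-hand state as separate
        # fields, since sample() below unpacks six values.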

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            for iter in range(num_train_iter):

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)

                moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                #                actionsPickDescriptorsNext = np.concatenate([np.zeros(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                #                actionsPlaceDescriptorsNext = np.concatenate([np.ones(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                actionsPickDescriptorsNext = np.concatenate(
                    [moveDescriptorsNext,
                     np.zeros(np.shape(moveDescriptorsNext))], axis=3)
                actionsPlaceDescriptorsNext = np.concatenate(
                    [np.zeros(np.shape(moveDescriptorsNext)),
                     moveDescriptorsNext], axis=3)
                actionDescriptorsNextFlat = np.stack(
                    [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],
                    axis=1)

                #            # TABULAR version
                #            actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat,[batch_size*2*num_patches,-1]) == 1
                #            qNext = getTabular(actionDescriptorsNext)

                # DQN version
                actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                    batch_size * 2 * num_patches, deicticActionShape[0],
                    deicticActionShape[1], deicticActionShape[2]
                ]) == 1
                qNext = getq(actionDescriptorsNext)

                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)

                qNextTiled = np.reshape(
                    qNext[range(2 * batch_size * num_patches), -1,
                          states_tp1Full],
                    [batch_size, 2, num_patches, -1])  # USE CASCADE
                #            qNextTiled = np.reshape(qNext[range(2*batch_size*num_patches),0,states_tp1Full],[batch_size,2,num_patches,-1]) # NO CASCADE
                qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)
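                # i.e. the value of the best next action over both action types
                # (pick/place) and all patches, read from the last cascade level at the
                # next in-hand state.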

                targets = rewards + (1 - dones) * gamma * qNextmax

                #            # TABULAR version
                #            qCurr = getTabular(actions)

                # DQN version
                qCurr = getq(actions)

                qCurrTarget = np.copy(qCurr)
                qCurrTarget[range(batch_size), 0, states_tp1] = targets
                for i in range(num_cascade - 1):
                    mask = targets < qCurr[range(batch_size), i, states_tp1]
                    qCurrTarget[range(batch_size),i+1,states_tp1] = \
                        mask*targets + \
                        (1-mask)*qCurrTarget[range(batch_size),i+1,states_tp1]
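                # Cascade semantics: level 0 always takes the raw TD target; level i+1
                # only accepts the target when it falls below level i's current estimate,
                # so deeper levels hold progressively more pruned (lower) estimates. The
                # last level (index -1) is the one used for action selection above.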

                #            # TABULAR version
                #            trainTabular(actions,qCurrTarget)

                # DQN version
                targetTrain(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

# ******** Example 3 ********

def main():

    #    env = envstandalone.MultiGhostEvade()
    env = envstandalone.GhostEvade()
    #    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    #    target_network_update_freq=100
    #    target_network_update_freq=10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 2)
    #    deicticShape = (3,3,4)
    #    deicticShape = (4,4,2)
    #    deicticShape = (4,4,4)
    #    deicticShape = (8,8,2)
    num_deictic_patches = 36
    #    num_deictic_patches = 25
    #    num_deictic_patches = 1

    #    num_actions = 4
    #    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #    model = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1)],
        convs=[(32, 3, 1)],
        #        convs=[(32,4,1)],
        #        convs=[(16,4,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
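    # NOTE (assumption): getDeic is taken to extract every 3x3 window (two channels
    # each) from the 8x8 observation, giving (8 - 3 + 1)**2 = 36 deictic patches,
    # consistent with num_deictic_patches above.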

    #    getqRotated = build_getqRotated(make_obsDeic_ph=make_obsDeic_ph,
    #                                    q_func=q_func,
    #                                    num_actions=num_actions,
    #                                    num_cascade=num_cascade,
    #                                    reuse=True)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

        qCurr = getq(np.array(obsDeictic))

        #        # average Q values from all four orientations
        #        qCurrRot0 = getq(np.array(obsDeictic))
        #        qCurrRot1 = getq(np.rot90(obsDeictic,k=1,axes=(1,2)))
        #        qCurrRot2 = getq(np.rot90(obsDeictic,k=2,axes=(1,2)))
        #        qCurrRot3 = getq(np.rot90(obsDeictic,k=3,axes=(1,2)))
        #        qCurr = 0.25 * qCurrRot0 + np.roll(qCurrRot1,1,axis=2) + np.roll(qCurrRot2,2,axis=2) + np.roll(qCurrRot3,3,axis=2)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))  # USE CASCADE
        #        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
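        # For each action, the best estimate across the 36 glimpses is taken (max over
        # axis 0 of the last cascade level), and the action with the highest such value
        # is selected; epsilon-greedy exploration below can override it.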
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Tile rewards/dones/actions per deictic patch: (batch_size * num_deictic_patches,) = (32 * 36,) = (1152,)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # WITHOUT ROTATIONS
            #            qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # WITH ROTATIONS
            #            qNextRot0 = getq(np.array(obses_tp1_deic))
            #            qNextRot1 = getq(np.rot90(obses_tp1_deic,k=1,axes=(1,2)))
            #            qNextRot2 = getq(np.rot90(obses_tp1_deic,k=2,axes=(1,2)))
            #            qNextRot3 = getq(np.rot90(obses_tp1_deic,k=3,axes=(1,2)))
            #            qNext = 0.25 * qNextRot0 + np.roll(qNextRot1,1,axis=2) + np.roll(qNextRot2,2,axis=2) + np.roll(qNextRot3,3,axis=2)
            #
            #            obses_t_deicRot1 = np.rot90(obses_t_deic,k=1,axes=(1,2))
            #            obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2))
            #            obses_t_deicRot3 = np.rot90(obses_t_deic,k=3,axes=(1,2))
            ##            obses_t_deicFull = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
            #            qCurrRot0 = getq(np.array(obses_t_deic))
            #            qCurrRot1 = getq(np.array(obses_t_deicRot1))
            #            qCurrRot2 = getq(np.array(obses_t_deicRot2))
            #            qCurrRot3 = getq(np.array(obses_t_deicRot3))
            #            qCurr = 0.25 * qCurrRot0 + np.roll(qCurrRot1,1,axis=2) + np.roll(qCurrRot2,2,axis=2) + np.roll(qCurrRot3,3,axis=2)
            ##            qCurrFull = np.r_[qCurrRot0, qCurrRot1, qCurrRot2, qCurrRot3]

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)  # standard
            #            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
            #            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            # Take min over targets in same group
            #            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            #            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            #            for i in range(np.shape(uniqueCounts)[0]):
            #                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            # Copy into cascade with pruning -- WITHOUT ROTATIONS
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
#            qCurrTargetsFull = np.tile(qCurrTargets,[4,1,1])

#            # Copy into cascade with pruning -- WITH ROTATIONS
#            actionsTiledFull = np.concatenate([actionsTiled, actionsTiled-1, actionsTiled-2, actionsTiled-3])
#            actionsTiledFull = actionsTiledFull + 4 * (actionsTiledFull<0)
#            targetsFull = np.repeat(targets,4)
#            qCurrTargets = np.copy(qCurrFull)
#            qCurrTargets[range(4*batch_size*num_deictic_patches),0,actionsTiledFull] = targetsFull
#            for i in range(num_cascade-1):
#                maskFull = np.repeat(targets < qCurr[range(batch_size*num_deictic_patches),i,actionsTiled],4)
#                qCurrTargets[range(4*batch_size*num_deictic_patches),i+1,actionsTiledFull] = \
#                    maskFull*targetsFull + \
#                    (1-maskFull)*qCurrTargets[range(4*batch_size*num_deictic_patches),i+1,actionsTiledFull]

            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


#            td_error_out, obses_deic_out, targets_out = targetTrain(obses_t_deicFull, qCurrTargets)

#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )

        # Update target network periodically (with target_network_update_freq = 1, the target net is synced every step).
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs