Example #1
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):

                #                # one-channel output
                #                deicticObsThis = obs[i:i+windowLen,j:j+windowLen,:]

                # two channel output
                deicticObsThis = np.zeros(deicticShape)
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 10
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 20

                deicticObs.append(deicticObsThis)

        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True)

    #    # MLP version
    #    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

        #        # MLP version
        #        return U.BatchInput([9], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,9]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,9]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,9]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


            #            # MLP version
            #            td_error_out, obses_deic_out, targets_out = targetTrain(
            #                    np.reshape(obses_t_deic,[-1,9]),
            #                    qCurrTargets
            #                    )

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
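# --- Standalone sketch (not part of the example above) ---------------------
# A minimal numpy illustration of the "copy into cascade with pruning" block
# from the example above. Shapes follow the example (rows = batch_size *
# num_deictic_patches, then num_cascade levels, then num_actions); the random
# inputs below are made up purely for illustration.
import numpy as np

def cascade_prune_demo(num_rows=6, num_cascade=5, num_actions=4, seed=0):
    rng = np.random.RandomState(seed)
    qCurr = rng.rand(num_rows, num_cascade, num_actions)   # stand-in for getq output
    targets = rng.rand(num_rows)                           # stand-in Bellman targets
    actions = rng.randint(num_actions, size=num_rows)      # stand-in actionsTiled

    rows = np.arange(num_rows)
    qCurrTargets = np.copy(qCurr)

    # Cascade level 0 always receives the Bellman target.
    qCurrTargets[rows, 0, actions] = targets

    # Each deeper level accepts the target only where it undercuts the value
    # stored at the previous level; otherwise it keeps its current value.
    for i in range(num_cascade - 1):
        mask = targets < qCurrTargets[rows, i, actions]
        qCurrTargets[rows, i + 1, actions] = (
            mask * targets + (1 - mask) * qCurrTargets[rows, i + 1, actions])
    return qCurrTargets

print(cascade_prune_demo()[0])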
Example #2
def main():

    #    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    #    batch_size=1
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        #        convs=[(16,2,1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)
        #        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    #    tabularQ = 100*np.ones([deicticShape[0]+1,deicticShape[1]+1,deicticShape[0]+1,deicticShape[1]+1, num_actions])
    tabularQ = 0 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeicticObs(obs)

        # get q: neural network
        qCurr = getq(np.array(obsDeictic))

        #        # get q: tabular
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
            #        if t > max_timesteps:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (1152,) form
            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network)
            obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: tabular version
            #            q_resize_from_network = [batch_size*num_deictic_patches,num_actions]
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:]
            #            qNext = np.reshape(qNext,q_resize_from_network)
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:]
            #            qCurr = np.reshape(qCurr,q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(qNext, 1)
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Update values: neural network version
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches),
                         actionsTiled] = targets

            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


            #            # Update values: tabular version
            #            stateCurrTiled = np.reshape(np.rollaxis(stateCurr,1),[num_actions,batch_size*num_deictic_patches])
            #            tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] = \
            #                (1 - learning_alpha) * tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] \
            #                + learning_alpha * targets

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
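# --- Standalone sketch (not part of the example above) ---------------------
# convertState above packs each 3x3x1 deictic patch into four integers:
# (agent row, agent col, ghost row, ghost col), with 3 as the "not visible in
# this patch" sentinel -- presumably why tabularQ is allocated with
# deicticShape[0] + 1 entries per axis. The toy patches below are made up.
import numpy as np

def convert_state_demo():
    patches = np.zeros((3, 3, 3, 1))   # three fake 3x3x1 patches
    patches[0, 1, 2, 0] = 10           # agent visible in patch 0 at (1, 2)
    patches[2, 0, 1, 0] = 20           # ghost visible in patch 2 at (0, 1)

    small = np.squeeze(patches)                  # (3, 3, 3)
    agent_pos = np.nonzero(small == 10)          # (patch, row, col) index arrays
    ghost_pos = np.nonzero(small == 20)

    state = 3 * np.ones((4, patches.shape[0]))   # default: not visible
    state[0, agent_pos[0]] = agent_pos[1]
    state[1, agent_pos[0]] = agent_pos[2]
    state[2, ghost_pos[0]] = ghost_pos[1]
    state[3, ghost_pos[0]] = ghost_pos[2]
    return np.int32(state)

print(convert_state_demo())
# [[1 3 3]
#  [2 3 3]
#  [3 3 0]
#  [3 3 1]]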
def main():

    env = envstandalone.TestRob3Env()
    
    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
#    buffer_size=1
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
#    deicticShape = (3,3,1)
    deicticShape = (3,3,2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16,3,1)],
#        convs=[(16,2,1)],
        hiddens=[16],
        dueling=True
    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])

    q_func=model
    lr=0.001
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)
        
#        # MLP version
#        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
            )
    
    getqTarget = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func_target"
            )

    update_target = build_update_target(scope="deepq", 
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")
                      
    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func"
    )
    
    getDeic = build_getDeic(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])
        
        # CNN version
        qCurr = getq(np.array(obsDeictic))
        
#        # MLP version
#        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            
            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones,num_deictic_patches)
            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            actionsTiled = np.repeat(actions,num_deictic_patches)
            
            # Get curr, next values: CNN version
            qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

#            # Get curr, next values: MLP version
#            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
#            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:,-1,:],1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
            qCurrTargets = np.copy(qCurr)
            
            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
            for i in range(num_cascade-1):
                mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
            
            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                    obses_t_deic,
                    qCurrTargets
                    )
            
#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )
                
        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
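# --- Standalone sketch (not part of the example above) ---------------------
# The commented-out "double-q" lines above pick the next action with the
# target network (qNextTarget) and then read that action's value from the
# online network (qNext). The numpy snippet below reproduces just that
# selection/evaluation split on random stand-in arrays.
import numpy as np

rng = np.random.RandomState(1)
num_rows, num_cascade, num_actions = 8, 5, 4   # rows = batch_size * num_deictic_patches

qNext = rng.rand(num_rows, num_cascade, num_actions)        # online-network values
qNextTarget = rng.rand(num_rows, num_cascade, num_actions)  # target-network values

# Standard max over the last cascade level (what the example actually uses).
qNextmax_standard = np.max(qNext[:, -1, :], 1)

# Double-q variant from the commented-out lines.
actionsNext = np.argmax(qNextTarget[:, -1, :], 1)
qNextmax_double = qNext[range(num_rows), -1, actionsNext]

print(qNextmax_standard)
print(qNextmax_double)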
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2],
                         stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext1 = tabularQ[stateNext[0], stateNext[1], stateNext[2],
                          stateNext[3], :]

        # perform learning update
        qNextmaxa = np.max(
            qNext1,
            1)  # this deictic max seems to work better on this problem. why?
        #        qNextmaxa = np.max(qNext1) # this is the correct deictic max
        targets = rew + (1 - done) * gamma * qNextmaxa

        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
            (1 - learning_alpha) * tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
            + learning_alpha * targets

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))

        obs = new_obs
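# --- Standalone sketch (not part of the example above) ---------------------
# The tabular update at the end of the loop above is the usual alpha-blended
# Q-learning rule, applied per deictic patch:
#     Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * (1 - done) * max_a' Q(s',a'))
# Worked with made-up numbers for a single patch/action:
import numpy as np

gamma, learning_alpha = 0.98, 0.2
q_sa = 100.0                                  # current tabularQ[stateCurr..., action]
q_next = np.array([90.0, 95.0, 80.0, 85.0])   # next-state q-values over the 4 actions
rew, done = 1.0, 0.0

target = rew + (1 - done) * gamma * np.max(q_next)    # 1 + 0.98 * 95 = 94.1
q_sa_new = (1 - learning_alpha) * q_sa + learning_alpha * target
print(q_sa_new)                               # 0.8 * 100 + 0.2 * 94.1 = 98.82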
def main():

#    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()
    
    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
#    buffer_size=1000
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
    target_network_update_freq=500
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
    deicticShape = (3,3,1)
    num_deictic_patches=36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu=16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObs)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(16,3,1)],
        convs=[(16,2,1)],
#        convs=[(32,3,1)],
        hiddens=[16],
#        hiddens=[64],
#        dueling=True
        dueling=False
    )

    q_func=model
#    lr=1e-3
    lr=0.001
    
    def make_obs_ph(name):
#        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False
    )

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version        
        qCurr = getq(np.array([obs]))
        
        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise,1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        
#        # debug
#        if t > 5000:
#            print("obs:\n" + str(np.squeeze(obs)))
#            print("qCurr:\n" + str(qCurr))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
#            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
#            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
#            action
            
        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions,[batch_size,]))
            
            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext,1)
            targets = rewards + (1-dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:,i] = myActions * targets + (1 - myActions) * qCurr[:,i]
            
            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                    obses_t,
                    qCurrTargets
                    )

            td_error_pre = qCurr[range(batch_size),actions] - targets
            
#            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)
            
            td_error_post = qCurr[range(batch_size),actions] - targets
#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

                
        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
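# --- Standalone sketch (not part of the example above) ---------------------
# The loop that builds qCurrTargets above writes the Bellman target only into
# the column of the action that was actually taken and copies the current
# q-value everywhere else, so the untouched columns contribute essentially no
# training error. The same thing can be written with fancy indexing; both
# forms are compared below on random stand-in data.
import numpy as np

rng = np.random.RandomState(0)
batch_size, num_actions = 5, 4
qCurr = rng.rand(batch_size, num_actions)
targets = rng.rand(batch_size)
actions = rng.randint(num_actions, size=batch_size)

# Per-action masking, as in the example above.
qCurrTargets_mask = np.zeros(np.shape(qCurr))
for i in range(num_actions):
    myActions = actions == i
    qCurrTargets_mask[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

# Equivalent fancy-indexed form.
qCurrTargets_idx = np.copy(qCurr)
qCurrTargets_idx[range(batch_size), actions] = targets

assert np.allclose(qCurrTargets_mask, qCurrTargets_idx)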
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 50000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500

    batch_size = 64
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    tabularQ = 100 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    obs = env.reset()
    #    OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2],
                         stateCurr[3], :]

        # select action
        action = np.argmax(np.max(qCurr, 0))
        selPatch = np.argmax(np.max(qCurr, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # get next q-values
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2],
                         stateNext[3], :]

        # perform learning update
        qNextmaxa = np.max(qNext, 1)
        targets = rew + (1 - done) * gamma * qNextmaxa

        #        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        #        if max_negative_td_error > 5:
        #            max_negative_td_error
        #        print("max_td_error: " + str(max_negative_td_error))
        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],
                 action] = np.minimum(
                     targets, tabularQ[stateCurr[0], stateCurr[1],
                                       stateCurr[2], stateCurr[3], action])

        #        # Store transition in the replay buffer.
        #        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) +
                  ", max q at curr state: " + str(np.max(qCurr)))


            #            # stop at the end of training
            #            if t > max_timesteps * 0.75:
            #                np.set_printoptions(precision=1)
            #                obsDeicticReshape = np.reshape(obsDeictic,[36,9])
            #                todisplay = np.c_[np.max(qCurr,1), obsDeicticReshape]
            #                print("q-values:\n" + str(todisplay))
            #                print("obs:\n" + str(np.squeeze(obs)))
            #                print("action: " + str(action) + ", patch: " + str(selPatch))
            #                t

        # *************************************
        # *************************************
        # to do: set a break point when there is a decrease in value and study that situation...
        # I noticed the deictic representations are weird when 10 and 20 are vertically separated by one empty row...
        # env.step came back with rew=1 and done=True. That shouldn't happen!
        # *************************************
        # *************************************

        obs = new_obs

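# --- Standalone sketch (not part of the example above) ---------------------
# This last variant initialises tabularQ optimistically at 100 and then only
# ever lowers an entry, via np.minimum(targets, current), instead of blending
# with a learning rate. A toy trace of that monotone update for one entry,
# with made-up targets:
import numpy as np

q_sa = 100.0                              # optimistic initial value
fake_targets = [40.0, 55.0, 30.0, 35.0]   # made-up Bellman targets over time

for tgt in fake_targets:
    q_sa = np.minimum(tgt, q_sa)          # the entry can only decrease
    print(q_sa)                           # 40.0, 40.0, 30.0, 30.0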