# Assumed imports (not present in the original excerpt): numpy, TensorFlow, and
# the OpenAI-baselines-style helpers referenced below. The exact module paths
# for the repo-local modules (envstandalone, models, build_graph) are a guess
# based on the identifiers used in the code; adjust to the actual project
# layout. The later scripts additionally assume build_getq, build_targetTrain,
# and build_getDeic are importable (e.g. from the same build_graph module).
import time

import numpy as np
import tensorflow as tf

import envstandalone
import models
import build_graph
import baselines.common.tf_util as U
from baselines.common.schedules import LinearSchedule
from baselines.deepq.replay_buffer import ReplayBuffer


def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 4

    deicticShape = (3, 3, 4)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a
    # low-level (zoom-in) view and a foveated (zoom-out) view.
    # input:  n x n x 1
    # output: num_patches x dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel 0: agent in zoom-in window; channel 1: ball in zoom-in window
        # channel 2: agent in zoom-out window; channel 3: ball in zoom-out window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):

                    # THE VERSION BELOW USES A FIXED VIEW
                    # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                            [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                            [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                            [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                            [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [
                        [(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])],
                        [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])],
                        [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]]

                # Careful to append a copy, not a reference: appending the
                # reference was a bug before it was corrected.
                deicticObs.append(deicticObsThis.copy())

        return np.array(deicticObs)

    # input:  num_patches x dn x dn x 4 tensor of deictic patches
    # output: 4 x num_patches matrix of numeric deictic states
    def convertState(observations):

        # Reshape to patches x flatimage x channel.
        # channel 0 = zoomin agent, channel 1 = zoomin ball
        # channel 2 = zoomout agent, channel 3 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x num_patches.
        # row 0: pos of agent in zoomin, row 1: pos of ball in zoomin
        # row 2: pos of agent in zoomout, row 3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones((4, shape[0]))  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
            # state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return np.array(state_numeric_batch)

    # Same as getDeicticObs, but operates on a batch rather than a single obs.
    # output: batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return np.array(deicticObsBatch)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        # convs=[(32,3,1)],
        hiddens=[16],
        # hiddens=[64],
        # dueling=True
        dueling=False)

    q_func = model
    lr = 1e-3

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_cascaded(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_cascade=num_cascade,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # update_target()

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 1 * np.ones(
        (dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions))

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)

        # # Get current q-values: tabular version
        # stateCurr = convertState(obsDeictic)
        # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],-1,:]

        # Get current q-values: neural network version
        qCurr = getq(np.array(obsDeictic))[:, -1, :]

        # select action
        # Add a small amount of per-entry noise to break ties randomly.
        # (The original added a single scalar, which leaves ties unbroken.)
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # # debug
        # if t > 5000:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("qCurr:\n" + str(qCurr))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))
        #     print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #     print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #     action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            q_resize_from_network = [
                batch_size, num_deictic_patches, num_cascade, num_actions
            ]

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # # Get curr, next values: tabular version
            # stateNext = convertStateBatch(obses_tp1_deic)
            # qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],-1,:]
            # stateCurr = convertStateBatch(obses_t_deic)
            # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # Get curr, next values: neural network version
            qNext = np.reshape(
                getq(np.reshape(obses_tp1_deic, obs_resize_to_network)),
                q_resize_from_network)[:, :, -1, :]
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(np.max(qNext, 2), 1)
            targetsRaw = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])

            # Get qCurrActionSelect
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])
            qCurrActionSelect = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i]

            # Get targets masked for cascade level
            targetMask = targetsTiled < qCurrActionSelect
            targets = np.zeros((batch_size, num_deictic_patches, num_cascade))
            targets[:, :, 0] = targetsTiled[:, :, 0]
            targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1]
            targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2]
            targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3]
            targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4]

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actionsTiled == i
                qCurrTargets[:, :, :, i] = myActions * targets + \
                    (1 - myActions) * qCurr[:, :, :, i]

            # # Update values: tabular version
            # tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] = \
            #     (1 - learning_alpha) * tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] \
            #     + learning_alpha * targets

            # Update values: neural network version
            targets_resize_to_network = [
                batch_size * num_deictic_patches, num_cascade, num_actions
            ]
            td_error_out, obses_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, obs_resize_to_network),
                np.reshape(qCurrTargets, targets_resize_to_network))

            td_error_pre = qCurrActionSelect - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # # tabular version
            # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # neural network version
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)
            qCurrActionSelect_post = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :, i]

            td_error_post = qCurrActionSelect_post - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

            if -1 in rewards:
                dones  # no-op; convenient spot for a debugger breakpoint

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
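# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original scripts): a minimal numpy
# version of the cascaded target pruning used above. The function name and
# argument names are hypothetical. Level 0 always receives the raw Bellman
# target; level k+1 receives the target only where it is smaller than the
# value level k currently assigns to the chosen action.
import numpy as np  # noqa: E402 (repeated for self-containment)


def prune_cascade_targets(targets_raw, q_selected):
    """targets_raw: (batch, patches); q_selected: (batch, patches, cascade)."""
    num_cascade = q_selected.shape[-1]
    targets = np.zeros_like(q_selected)
    targets[..., 0] = targets_raw
    for k in range(num_cascade - 1):
        # Accept the target at the next level only where it would decrease
        # the current estimate at this level.
        mask = targets_raw < q_selected[..., k]
        targets[..., k + 1] = np.where(mask, targets_raw, q_selected[..., k + 1])
    return targets
# ----------------------------------------------------------------------------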
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3,3,1)
    # deicticShape = (3,3,2)
    # deicticShape = (4,4,1)
    # deicticShape = (4,4,2)
    deicticShape = (4, 4, 3)
    # deicticShape = (3,3,4)
    num_deictic_patches = 25

    # num_actions = 4
    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a
    # low-level (zoom-in) view and a foveated (zoom-out) view.
    # Note: not used by this version of the training loop (the TF-side getDeic
    # built below is used instead); kept here for reference.
    # input:  n x n x 1
    # output: num_patches x dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel 0: agent in zoom-in window; channel 1: ball in zoom-in window
        # channel 2: agent in zoom-out window; channel 3: ball in zoom-out window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):

                    # THE VERSION BELOW USES A FIXED VIEW
                    # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                            [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                            [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                            [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                            [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [
                        [(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])],
                        [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])],
                        [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]]

                # Careful to append a copy, not a reference: appending the
                # reference was a bug before it was corrected.
                deicticObs.append(deicticObsThis.copy())

        return np.array(deicticObs)

    # Same as getDeicticObs, but operates on a batch rather than a single obs.
    # output: (batches * glances) x dn x dn x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]])

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    # model = models.cnn_to_mlp(
    #     convs=[(16,4,1)],
    #     hiddens=[16],
    #     dueling=True
    # )

    # MLP version
    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):
        # CNN version
        # return U.BatchInput(deicticShape, name=name)

        # MLP version
        return U.BatchInput(
            [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
        # return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
        make_obsDeic_ph=make_obsDeic_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])
        # obsDeictic, patchesTiledStacked2 = getDeic([obs])

        # # CNN version
        # qCurr = getq(np.array(obsDeictic))

        # MLP version
        qCurr = getq(
            np.reshape(
                obsDeictic,
                [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            # obses_t_deic = getDeicticObsBatch(obses_t)
            # obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (batch_size * num_deictic_patches,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # # Get curr, next values: CNN version
            # qNext = getq(obses_tp1_deic)
            # qCurr = getq(obses_t_deic)

            # Get curr, next values: MLP version
            qNext = getq(
                np.reshape(
                    obses_tp1_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getq(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            # # This version takes the max over all glimpses
            # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            # # Copy into cascade without pruning
            # for i in range(num_cascade):
            #     qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled]
                qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled] = \
                    mask * targets + \
                    (1 - mask) * qCurrTargets[range(batch_size * num_deictic_patches), i + 1, actionsTiled]

            # # CNN version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #     obses_t_deic,
            #     qCurrTargets
            # )

            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
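# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original scripts): how np.repeat lines
# up per-transition quantities with per-patch rows once the deictic
# observations are flattened to (batch_size * num_deictic_patches, ...). The
# function name and the toy sizes below are hypothetical.
import numpy as np  # noqa: E402 (repeated for self-containment)


def repeat_alignment_demo():
    """Toy check: 2 transitions, 3 patches each."""
    rewards = np.array([1.0, 0.0])          # one reward per transition
    rewards_tiled = np.repeat(rewards, 3)   # -> [1., 1., 1., 0., 0., 0.]
    # Row i of a (batch*patches, ...) Q array now shares the same reward,
    # done flag, and action as every other patch from the same transition.
    return rewards_tiled
# ----------------------------------------------------------------------------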
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    # batch_size = 2
    train_freq = 4
    # train_freq = 8

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a
    # low-level (zoom-in) view and a foveated (zoom-out) view.
    # input:  n x n x 1
    # output: num_patches x dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel 0: agent in zoom-in window; channel 1: ball in zoom-in window
        # channel 2: agent in zoom-out window; channel 3: ball in zoom-out window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):

                    # THE VERSION BELOW USES A FIXED VIEW
                    # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                            [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                            [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                            [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                            [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [
                        [(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])],
                        [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])],
                        [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]]

                # Careful to append a copy, not a reference: appending the
                # reference was a bug before it was corrected.
                deicticObs.append(deicticObsThis.copy())

        return np.array(deicticObs)

    # input:  num_patches x dn x dn x 4 tensor of deictic patches
    # output: 4 x num_patches matrix of numeric deictic states
    def convertState(observations):

        # Reshape to patches x flatimage x channel.
        # channel 0 = zoomin agent, channel 1 = zoomin ball
        # channel 2 = zoomout agent, channel 3 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x num_patches.
        # row 0: pos of agent in zoomin, row 1: pos of ball in zoomin
        # row 2: pos of agent in zoomout, row 3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones((4, shape[0]))  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
            # state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return np.array(state_numeric_batch)

    # Same as getDeicticObs, but operates on a batch rather than a single obs.
    # output: batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return np.array(deicticObsBatch)

    dimSize = deicticShape[0] * deicticShape[1] + 1
    # tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # One optimistically initialised table per cascade level.
    tabularQ1 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ2 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ3 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ4 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    tabularQ5 = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])

    replay_buffer = ReplayBuffer(buffer_size)

    obs = env.reset()
    # OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)
        # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3], :]

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        item = (obs, action, rew, new_obs, float(done))
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # debug (threshold exceeds max_timesteps, so this never fires during
        # normal training; lower it to enable)
        if t > max_timesteps * 1.1:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] +
                                   obsDeictic[selPatch, :, :, 1]))
            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] +
                                 obsDeictic[selPatch, :, :, 3]))
            action  # no-op; convenient spot for a debugger breakpoint

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            stateCurr = convertStateBatch(getDeicticObsBatch(obses_t))
            stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1))

            # qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:]
            qNext = tabularQ5[stateNext[:, 0, :], stateNext[:, 1, :],
                              stateNext[:, 2, :], stateNext[:, 3, :], :]

            qNextmax = np.max(np.max(qNext, 2), 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            targetsTiled = np.tile(np.reshape(targets, [batch_size, 1]),
                                   [1, num_deictic_patches])
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1]),
                                   [1, num_deictic_patches])

            # tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled] = \
            #     np.minimum(targetsTiled, tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],actionsTiled])

            # Cascade with pruning: level k+1 only accepts the target where it
            # is smaller than level k's current estimate for the chosen action.
            target2_mask = targetsTiled < tabularQ1[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            target3_mask = targetsTiled < tabularQ2[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            target4_mask = targetsTiled < tabularQ3[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            target5_mask = targetsTiled < tabularQ4[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                                    stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]

            targets1 = targetsTiled
            targets2 = target2_mask * targetsTiled + \
                (1 - target2_mask) * tabularQ2[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                               stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets3 = target3_mask * targetsTiled + \
                (1 - target3_mask) * tabularQ3[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                               stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets4 = target4_mask * targetsTiled + \
                (1 - target4_mask) * tabularQ4[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                               stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]
            targets5 = target5_mask * targetsTiled + \
                (1 - target5_mask) * tabularQ5[stateCurr[:, 0, :], stateCurr[:, 1, :],
                                               stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled]

            tabularQ1[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] = \
                (1 - learning_alpha) * tabularQ1[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] \
                + learning_alpha * targets1
            tabularQ2[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] = \
                (1 - learning_alpha) * tabularQ2[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] \
                + learning_alpha * targets2
            tabularQ3[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] = \
                (1 - learning_alpha) * tabularQ3[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] \
                + learning_alpha * targets3
            tabularQ4[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] = \
                (1 - learning_alpha) * tabularQ4[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] \
                + learning_alpha * targets4
            tabularQ5[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] = \
                (1 - learning_alpha) * tabularQ5[stateCurr[:, 0, :], stateCurr[:, 1, :], stateCurr[:, 2, :], stateCurr[:, 3, :], actionsTiled] \
                + learning_alpha * targets5

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

        obs = new_obs
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 2

    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]

    # replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a
    # low-level (zoom-in) view and a foveated (zoom-out) view.
    # input:  n x n x 1
    # output: num_patches x dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel 0: agent in zoom-in window; channel 1: ball in zoom-in window
        # channel 2: agent in zoom-out window; channel 3: ball in zoom-out window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen,
                                              j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):

                    # THE VERSION BELOW USES A FIXED VIEW
                    # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                            [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                            [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [
                        [(k in patch[1:3, 1:3]), (k in patch[1:3, 3:5]), (k in patch[1:3, 5:7])],
                        [(k in patch[3:5, 1:3]), (k in patch[3:5, 3:5]), (k in patch[3:5, 5:7])],
                        [(k in patch[5:7, 1:3]), (k in patch[5:7, 3:5]), (k in patch[5:7, 5:7])]]

                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    # deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])],
                    #                            [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])],
                    #                            [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]]

                # Careful to append a copy, not a reference: appending the
                # reference was a bug before it was corrected.
                deicticObs.append(deicticObsThis.copy())

        return np.array(deicticObs)

    # Same as getDeicticObs, but operates on a batch rather than a single obs.
    # output: batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return np.array(deicticObsBatch)

    # input:  num_patches x dn x dn x 4 tensor of deictic patches
    # output: num_patches x 4 matrix of numeric deictic states
    def convertState(observations):

        # Reshape to patches x flatimage x channel.
        # channel 0 = zoomin agent, channel 1 = zoomin ball
        # channel 2 = zoomout agent, channel 3 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric columns:
        # col 0: pos of agent in zoomin, col 1: pos of ball in zoomin
        # col 2: pos of agent in zoomout, col 3: pos of ball in zoomout
        shape = np.shape(obs)
        # state_numeric = 9*np.ones((4,shape[0]))  # 9 indicates agent/ball does not appear at this zoom in this glance
        state_numeric = 9 * np.ones((shape[0], 4))  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            # state_numeric[i,pos[0][idx]] = pos[1][idx]
            state_numeric[pos[0][idx], i] = pos[1][idx]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return np.array(state_numeric_batch)

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 100 * np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # tabularQ1 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # tabularQ2 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # tabularQ3 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # tabularQ4 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    # tabularQ5 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])

    obs = env.reset()
    # OHEnc = np.identity(max_num_groups)

    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)

        # # do a couple of spot checks to verify that obsDeictic is correct
        # num2check = 17
        # print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1]))
        # print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3]))

        # qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        # convertState returns a (num_patches, 4) array here, so index its
        # columns to get one q-vector per patch.
        qCurr = tabularQ[stateCurr[:, 0], stateCurr[:, 1], stateCurr[:, 2], stateCurr[:, 3], :]

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # env.render()
        # print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

        # replay_buffer.add(obs, action, rew, new_obs, float(done))

        # if done == 1:
        #     print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew))
        #     action

        # debug (threshold exceeds max_timesteps, so this never fires during
        # normal training; lower it to enable)
        if t > max_timesteps * 1.05:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch, :, :, 0] +
                                   obsDeictic[selPatch, :, :, 1]))
            print("far:\n" + str(obsDeictic[selPatch, :, :, 2] +
                                 obsDeictic[selPatch, :, :, 3]))
            action  # no-op; convenient spot for a debugger breakpoint

        # # batched replay-buffer version (disabled)
        # if t > learning_starts and t % train_freq == 0:
        #     obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
        #     obses_t = np.reshape(obs,[1,8,8,1])
        #     obses_tp1 = np.reshape(new_obs,[1,8,8,1])
        #     stateCurr = convertStateBatch(getDeicticObsBatch(obses_t))
        #     stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1))
        #     qNext = tabularQ[stateNext[:,:,0], stateNext[:,:,1], stateNext[:,:,2], stateNext[:,:,3],:]
        #     qNextmax = np.max(np.max(qNext,2),1)
        #     targets = rew + (1-done) * gamma * qNextmax
        #     batch_size = 1
        #     targets = np.tile(np.reshape(targets,[batch_size,1]),[1,num_deictic_patches])
        #     tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action] = \
        #         np.minimum(targets, tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action])

        # online update: clip the table down toward the Bellman target
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[:, 0], stateNext[:, 1], stateNext[:, 2], stateNext[:, 3], :]
        qNextmax = np.max(qNext)
        targets = rew + (1 - done) * gamma * qNextmax
        tabularQ[stateCurr[:, 0], stateCurr[:, 1], stateCurr[:, 2], stateCurr[:, 3], action] = \
            np.minimum(targets, tabularQ[stateCurr[:, 0], stateCurr[:, 1], stateCurr[:, 2], stateCurr[:, 3], action])

        # # get next q-values
        # stateNext = convertState(getDeicticObs(new_obs))
        # qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        # qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        # # perform learning update
        # qNextmax = np.max(qNext)
        # targets = rew + (1-done) * gamma * qNextmax
        # max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        #                                * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        # if max_negative_td_error > 5:
        #     max_negative_td_error
        # print("max_td_error: " + str(max_negative_td_error))
        # print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
        # print("targets:\n" + str(targets))
        # tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        # target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # targets1 = targets
        # targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        # targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
        #
        # tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
        #     + learning_alpha * targets1
        # tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
        #     + learning_alpha * targets2
        # tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
        #     + learning_alpha * targets3
        # tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
        #     + learning_alpha * targets4
        # tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
        #     (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
        #     + learning_alpha * targets5

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            # print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))

        obs = new_obs

        # stop at the end of training (threshold exceeds max_timesteps, so this
        # never fires during normal training; lower it to enable)
        if t > max_timesteps * 1.1:
            # np.set_printoptions(precision=1)
            # np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})
            # qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            # qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            # qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            # qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            # qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
            # todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
            # todisplay = np.c_[qCurr5,np.transpose(stateCurr)]
            print("obs:\n" + str(np.squeeze(obs)))
            # todisplay = np.c_[np.max(qCurr5,1),np.transpose(stateCurr)]
            # print("q-values:\n" + str(todisplay))
            # print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
            # print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
            # print("action: " + str(action) + ", patch: " + str(selPatch))
            action  # no-op; convenient spot for a debugger breakpoint

        # print("obs:\n" + str(np.squeeze(obs)))
        # print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
        # print("action: " + str(action) + ", patch: " + str(selPatch))
        # t
        t  # no-op; convenient spot for a debugger breakpoint
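# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original scripts): the "take the
# minimum" tabular update used in the online loop above, on a toy table that
# is optimistically initialised at 100. The function name and the toy sizes
# are hypothetical. The entry can only move down toward the Bellman target,
# which is what makes the optimistic initialisation shrink over time.
import numpy as np  # noqa: E402 (repeated for self-containment)


def min_update_demo():
    """One minimum-style update on a 4-state x 3-action toy table."""
    Q = 100.0 * np.ones((4, 3))
    s, a, r, s_next, done, gamma = 0, 1, 1.0, 2, 0.0, 0.98
    target = r + (1.0 - done) * gamma * np.max(Q[s_next])  # 1 + 0.98*100 = 99.0
    Q[s, a] = np.minimum(target, Q[s, a])                  # clipped down to 99.0
    return Q[s, a]
# ----------------------------------------------------------------------------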
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    # buffer_size = 1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    # deicticShape = (3,3,4)
    # num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        # convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        # convs=[(32,3,1)],
        hiddens=[16],
        # hiddens=[64],
        # dueling=True
        dueling=False)

    q_func = model
    # lr = 1e-3
    lr = 0.001

    def make_obs_ph(name):
        # return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # # debug
        # if t > 5000:
        #     print("obs:\n" + str(np.squeeze(obs)))
        #     print("qCurr:\n" + str(qCurr))
        #     print("action: " + str(action) + ", patch: " + str(selPatch))
        #     print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #     print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #     action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions, [batch_size, ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (1 - myActions) * qCurr[:, i]

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs