def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr,
             replay_iters, eps_T, eps_t_init, gamma, update_period, board,
             device, model_path, r_memory_Fname, o_model_name, model_load=False):
    self.step_now = 0                    # current step counter
    self.reward_num = 0
    self.reward_accumulated = 0          # delayed-reward accumulator
    self.final_tem = 10                  # placeholder final temperature, just for now
    self.step_last_update = 0            # step of the last target-network update
    self.update_period = update_period   # target update period (off-policy)
    self.learn_start_time = learn_start_time
    self.gamma = gamma
    self.batch_size = batch_size
    self.memory_size = memory_size
    self.alpha = 0.6                     # prioritized-replay exponent
    self.beta = 0.4                      # initial importance-sampling exponent
    self.replay_beta_iters = replay_iters
    self.replay_eps = 1e-6
    self.memory_min_num = 1000           # the min number of transitions before learning
    self.step_last_learn = 0             # step of the last learning update
    self.learn_fre = learn_fre           # step frequency to learn
    self.e_greedy = 1                    # current epsilon
    self.eps_T = eps_T                   # epsilon schedule horizon (e.g. 800,000 steps)
    self.eps_t_init = eps_t_init         # epsilon schedule start step
    self.device = device
    self.model_path = model_path
    self.mode_enjoy = model_load
    if not model_load:
        # `action_num` is assumed to be defined at module level.
        self.policy_net = DQN(board[0], board[1], action_num).to(device)
        self.target_net = DQN(board[0], board[1], action_num).to(device)
        self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.functional.mse_loss  # use the MSE loss
        self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
        self.beta_schedule = LinearSchedule(self.replay_beta_iters, self.beta, 1.0)
    else:
        self.load(o_model_name)
    # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
    self.obs_new = None
    self.obs_old = None
    self.action = None
    self.action_old = None
    self.dqn_direct_flag = False   # True once the DQN action has been executed
    self.model_save_flag = False
def parse_eps_schedule(eps_schedule_str):
    """Parse a schedule spec of the form 'linear:steps,init_p,final_p' or 'const:val'."""
    algo, extra_args = eps_schedule_str.split(':')
    if algo == 'linear':
        steps, init_p, final_p = map(float, extra_args.split(','))
        steps = int(steps)
        return LinearSchedule(steps, initial_p=init_p, final_p=final_p)
    elif algo == 'const':
        val = extra_args.split(',')[0]
        return ConstantSchedule(float(val))
    else:
        raise NotImplementedError("unknown schedule type: " + algo)
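# ---------------------------------------------------------------------------
# For reference, minimal sketches of the two schedule classes the parser above
# assumes (a baselines-style interface with a value(t) method). The real
# LinearSchedule / ConstantSchedule may differ in detail.
class LinearScheduleSketch(object):
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.05):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule elapsed, capped at 1.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


class ConstantScheduleSketch(object):
    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value

# e.g. parse_eps_schedule('linear:1000,1.0,0.1') yields a schedule whose
# value(500) is 0.55, halfway between 1.0 and 0.1.
# ---------------------------------------------------------------------------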
def init_dqn(args):
    """Initialises and returns the objects needed for deep Q-learning:
    Q-network, target network, replay buffer, optimizer and epsilon schedule.
    """
    logging.info("Initialising DQN with architecture {} and optimizer {}".format(
        args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    else:
        raise ValueError("unknown dqn_archi: {}".format(args.dqn_archi))
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(), lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(), lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)
    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(
            schedule_timesteps=int(args.exploration_fraction * args.n_agent_steps),
            initial_p=args.epsilon_start,
            final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)
    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
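# ---------------------------------------------------------------------------
# A minimal sketch of the ExpSchedule interface assumed above: exponential
# decay from initial_p toward the floor final_p at decay_rate per step. The
# real class may differ in detail.
class ExpScheduleSketch(object):
    def __init__(self, decay_rate, final_p, initial_p=1.0):
        self.decay_rate = decay_rate
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Decays multiplicatively, but never below the floor final_p.
        return self.final_p + (self.initial_p - self.final_p) * (self.decay_rate ** t)
# ---------------------------------------------------------------------------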
def learn(self, total_timesteps=None, total_episodes=None, log_interval=100,
          ckpt_interval=100, ckpt_path=None):
    last_100rewards = np.zeros(100)
    last_100rewards[:] = np.nan
    if total_timesteps and total_episodes:
        raise ValueError("Only one of total_timesteps or total_episodes can be specified")
    if ckpt_path is None:
        print('Checkpoint path is not provided, no intermediate models will be saved')

    loop_type = 'episode' if total_episodes else 'timesteps'
    loop_var = total_timesteps if total_timesteps is not None else total_episodes

    if self.exploration_type == 'linear':
        self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var,
                                          initial=self.exploration_initial_eps,
                                          final=self.exploration_final_eps)
    elif self.exploration_type == 'exponential':
        self.exploration = ExponentialSchedule(frac=self.exploration_frac,
                                               initial=self.exploration_initial_eps,
                                               final=self.exploration_final_eps)

    train = True
    done = False
    step = 0
    ep_reward = 0
    obs = self.env.reset()
    while train:
        if loop_type == 'episode':
            update_eps = self.exploration.value(self.ep_done)
        if loop_type == 'timesteps':
            update_eps = self.exploration.value(self.elapsed_steps)

        # Epsilon-greedy action selection.
        if np.random.random_sample() > update_eps:
            action, value = self.policy.predict(obs, deterministic=True)
        else:
            action, value = self.policy.predict(obs, deterministic=False)

        next_obs, reward, done, info = self.env.step(action)
        argmax_a = np.argmax(self.qvalues[next_obs])

        if isinstance(self.observation_space, Tuple):
            # TD error: r + gamma * Q(s', argmax_a') * (1 - done) - Q(s, a)
            td_error = (reward
                        + self.gamma * self.qvalues[next_obs + (argmax_a,)] * (1 - int(done))
                        - self.qvalues[obs + (action,)])
            self.qvalues[obs + (action,)] += self.learning_rate * td_error
            if self.policy.intent:
                intent_update = np.zeros(self.qvalues.shape)
                intent_update[obs + (action,)] += 1
                intent_error = (intent_update
                                + self.gamma * self.hvalues[next_obs + (argmax_a,)] * (1 - int(done))
                                - self.hvalues[obs + (action,)])
                self.hvalues[obs + (action,)] += self.learning_rate * intent_error
        if isinstance(self.observation_space, Discrete):
            td_error = (reward
                        + self.gamma * np.max(self.qvalues[next_obs]) * (1 - int(done))
                        - self.qvalues[obs, action])
            self.qvalues[obs, action] += self.learning_rate * td_error
            if self.policy.intent:
                intent_update = np.zeros(self.qvalues.shape)
                intent_update[obs, action] += 1
                intent_error = (intent_update
                                + self.gamma * self.hvalues[next_obs, argmax_a] * (1 - int(done))
                                - self.hvalues[obs, action])
                self.hvalues[obs, action] += self.learning_rate * intent_error

        obs = next_obs
        step += 1
        ep_reward += reward
        self.elapsed_steps += 1
        if loop_type == 'timesteps':
            if self.elapsed_steps == total_timesteps:
                train = False
        if done:
            last_100rewards[self.ep_done % 100] = ep_reward
            print("\rEpisode {}/{}, Average Reward {}".format(
                self.ep_done, total_episodes, np.nanmean(last_100rewards)), end="")
            self.ep_done += 1
            step = 0
            ep_reward = 0
            obs = self.env.reset()
            if loop_type == 'episode':
                if self.ep_done >= total_episodes:
                    train = False
        if ckpt_path is not None and ckpt_interval:
            if loop_type == 'episode':
                if self.ep_done % ckpt_interval == 0 and done:
                    full_path = ckpt_path + '/' + str(self.ep_done)
                    super(QTabularRLModel, self).save(full_path)
            if loop_type == 'timesteps':
                if self.elapsed_steps % ckpt_interval == 0 and done:
                    full_path = ckpt_path + '/' + str(self.elapsed_steps)
                    super(QTabularRLModel, self).save(full_path)
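# ---------------------------------------------------------------------------
# A tiny self-contained example of the tabular Q-learning update performed in
# learn() above, with hypothetical values (gamma = 0.9, learning rate = 0.1):
import numpy as np

def q_update_example():
    q_table = np.zeros((4, 2))             # 4 states, 2 actions
    s, a, r, s2, terminal = 0, 1, 1.0, 2, False
    gamma, learning_rate = 0.9, 0.1
    # td_error = r + gamma * max_a' Q(s', a') * (1 - done) - Q(s, a)
    td_error = r + gamma * np.max(q_table[s2]) * (1 - int(terminal)) - q_table[s, a]
    q_table[s, a] += learning_rate * td_error
    return q_table[s, a]                   # 0.1 = 0 + 0.1 * (1.0 + 0.9*0 - 0)
# ---------------------------------------------------------------------------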
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride          # stride for action specification

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    batch_size = 32
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # Set parameters related to the shape of the patch and the number of patches
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off just by
    # setting the line below to False.
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_states=num_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_notholding",
                                reuse=reuseModels)
    getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                             q_func=q_func,
                             num_states=num_states,
                             num_cascade=5,
                             scope="deepq",
                             qscope="q_func_holding",
                             reuse=reuseModels)

    targetTrainNotHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels)
    targetTrainHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states
    # (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([2, ])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        # Get qCurr. Pick and place are split up in order to accommodate larger batches.
        qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors)
        qCurrHoldingPick = getqHolding(actionsPickDescriptors)
        qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors)
        qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors)
        qCurr = np.concatenate([np.r_[qCurrNotHoldingPick, qCurrNotHoldingPlace],
                                np.r_[qCurrHoldingPick, qCurrHoldingPlace]], axis=1)

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr[:, obs[1]])
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues

        # Select e-greedy action to execute
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # small noise to break ties randomly
        action = np.argmax(qCurrNoise[:, obs[1]])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(cp.copy(obs[1]),
                          np.copy(actionDescriptors[action, :]),
                          cp.copy(rew),
                          cp.copy(new_obs[1]),
                          cp.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = \
                    replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, states_tp1, dones = \
                    replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHolding(actionPatches,
                                  np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                                  np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) +
                  ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # Save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # Display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    plt.subplot(1, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]), vmin=5, vmax=12)
    plt.subplot(1, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.subplot(1, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.show()
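# ---------------------------------------------------------------------------
# The training block above relies on proportional prioritized replay (Schaul
# et al., 2016). A minimal numpy sketch of the sampling math the buffer is
# assumed to implement; the real PrioritizedReplayBuffer uses a segment tree
# instead of this O(N) version. `priorities` is an array of positive floats.
import numpy as np

def sample_indices(priorities, batch_size, alpha, beta):
    # P(i) proportional to p_i^alpha; importance weight w_i = (N * P(i))^-beta,
    # normalized by the largest weight in the buffer.
    probs = priorities ** alpha
    probs = probs / probs.sum()
    idxes = np.random.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[idxes]) ** (-beta)
    return idxes, weights / weights.max()

# After each train step the priorities are refreshed exactly as above:
#   new_priorities = np.abs(td_error) + prioritized_replay_eps
# ---------------------------------------------------------------------------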
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean and fewer than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be
            # at least as big as the bits required to encode obsBits. If it is too
            # small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        # Optimistic initialization: unseen keys start at 10.
        return np.array([q_func_tabular[x] if x in q_func_tabular
                         else 10 * np.ones(num_states) for x in keys])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + \
                    alpha * weights[i, :] * (qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 30000
    learning_starts = 1000
    buffer_size = 10000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    print_freq = 1
    gamma = .9
    target_network_update_freq = 1
    batch_size = 32
    train_freq = 1
    num_cpu = 16
    lr = 0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 2)
    num_cascade = 5
    num_states = 2  # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2

    # valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    # actionSelectionStrategy = "UNIFORM_RANDOM"  # actions selected randomly from the collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has an equal chance of being selected

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)],
                               hiddens=[48],
                               dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,
                                                              deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                          q_func=q_func,
                          num_states=num_states,
                          num_cascade=num_cascade,
                          scope="deepq",
                          qscope="q_func")
        targetTrain = build_targetTrain(make_actionDeic_ph=make_actionDeic_ph,
                                        make_target_ph=make_target_ph,
                                        make_weight_ph=make_weight_ph,
                                        q_func=q_func,
                                        num_states=num_states,
                                        num_cascade=num_cascade,
                                        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
                                        scope="deepq",
                                        qscope="q_func",
                                        grad_norm_clipping=1.)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0, env.num_blocks)
        stateDeictic = np.int32(obs[1] > 0)  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw > 0)
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack([moveDescriptors,
                                           np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),
                                            moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(
                actionDescriptors,
                [-1, deicticActionShape[0] * deicticActionShape[1] * deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)

        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01  # small noise to break ties randomly

        # Select action
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors, axis=0,
                                    return_index=True, return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            raise ValueError("unknown action selection strategy: " + actionSelectionStrategy)

        # Display state at the end
        if t > max_timesteps - 200:
            print(str(obs[0][:, :, 0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # Take action
        new_obs, rew, done, _ = env.step(action)

        # Display state at the end
        if (t > max_timesteps - 200) and done:
            print("done *********************** done")

        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = \
                    replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = \
                    replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1 > 0)

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1 > 0)
            moveDescriptorsNext1 = moveDescriptorsNext1 * 2 - 1

            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1,
                                                    np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)),
                                                     moveDescriptorsNext1], axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1,
                                               actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNext1 = np.reshape(
                actionDescriptorsNext1,
                [batch_size * num_patches * num_actions_discrete,
                 deicticActionShape[0], deicticActionShape[1], deicticActionShape[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(
                    actionDescriptorsNext1,
                    [batch_size * num_patches * num_actions_discrete, -1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)

            qNext1 = np.reshape(qNextFlat1,
                                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions, [batch_size, -1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size), states_t] - targets1
            qCurrTarget1[range(batch_size), states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1,
                             np.transpose(np.tile(weights, [num_states, 1])))  # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1,
                            np.transpose(np.tile(weights, [num_states, 1])))  # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) +
                  ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", beta: " + str(beta) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # Display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw > 0)
    moveDescriptors = moveDescriptors * 2 - 1

    actionsPickDescriptors = np.stack([moveDescriptors,
                                       np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),
                                        moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPick = getq(actionsPickDescriptors)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [8, 8])))

    qPlace = getq(actionsPlaceDescriptors)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [8, 8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
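# ---------------------------------------------------------------------------
# getTabularKeys above hashes each boolean descriptor row into a single uint64
# by byte-packing it. A tiny standalone illustration of that encoding:
import numpy as np

def packbits_key_example():
    patch = np.array([[True, False, True, True, False, False, True, False]])  # one row, 8 boolean "columns"
    packed = np.packbits(patch, 1)      # one byte per 8 columns: [[178]]
    key = sum((256**i) * packed[:, i].astype(np.uint64) for i in range(packed.shape[1]))
    return key                          # array([178], dtype=uint64): one dict key per row

# With descriptors longer than 64 bits the keys would overflow and collide,
# hence the "less than 64 bits" restriction noted in the comment above.
# ---------------------------------------------------------------------------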
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 4

    deicticShape = (3, 3, 4)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros((obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        # channel0: agent zoomin; channel1: ball zoomin;
        # channel2: agent in zoomout window; channel3: ball in zoomout window
        deicticObsThis = np.zeros((windowLen, windowLen, 4))
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # Wide view with 3 units in each cell.
                    deicticObsThis[:, :, k + 1] = \
                        [[(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])],
                         [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])],
                         [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]]
                # CAREFUL: append a copy, not a reference -- appending the
                # reference was a bug before it was corrected.
                deicticObs.append(deicticObsThis.copy())
        return np.array(deicticObs)

    # input: batch x nxnx1 tensor of observations
    # output: 4 x batch matrix of deictic observations
    def convertState(observations):
        # Reshape to batch x flatimage x channel.
        # Channel0 = zoomin agent, channel1 = zoomin ball
        # Channel2 = zoomout agent, channel3 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        # 9 indicates agent/ball does not appear at this zoom in this glance
        state_numeric = 9 * np.ones((4, shape[0]))
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return np.array(state_numeric_batch)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return np.array(deicticObsBatch)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(convs=[(16, 2, 1)],
                              hiddens=[16],
                              dueling=False)
    q_func = model
    lr = 1e-3

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_cascaded(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_cascade=num_cascade,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 1 * np.ones((dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions))

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        obsDeictic = getDeicticObs(obs)
        qCurr = getq(np.array(obsDeictic))[:, -1, :]

        # Select action. The noise must have the same shape as qCurr so that
        # ties are actually broken randomly (a scalar noise term cannot do that).
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # Take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # Sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obs_resize_to_network = [batch_size * num_deictic_patches,
                                     deicticShape[0], deicticShape[1], deicticShape[2]]
            q_resize_from_network = [batch_size, num_deictic_patches,
                                     num_cascade, num_actions]

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Get curr, next values: neural network version
            qNext = np.reshape(getq(np.reshape(obses_tp1_deic, obs_resize_to_network)),
                               q_resize_from_network)[:, :, -1, :]
            qCurr = np.reshape(getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                               q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(np.max(qNext, 2), 1)
            targetsRaw = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])

            # Get qCurrActionSelect
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])
            qCurrActionSelect = np.zeros((batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i]

            # Get targets masked for cascade level: level i+1 only adopts the
            # raw target where that target is below the level-i value.
            targetMask = targetsTiled < qCurrActionSelect
            targets = np.zeros((batch_size, num_deictic_patches, num_cascade))
            targets[:, :, 0] = targetsTiled[:, :, 0]
            targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1]
            targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2]
            targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3]
            targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + \
                (1 - targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4]

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actionsTiled == i
                qCurrTargets[:, :, :, i] = myActions * targets + \
                    (1 - myActions) * qCurr[:, :, :, i]

            # Update values: neural network version
            targets_resize_to_network = [batch_size * num_deictic_patches,
                                         num_cascade, num_actions]
            td_error_out, obses_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, obs_resize_to_network),
                np.reshape(qCurrTargets, targets_resize_to_network))

            # Diagnostics: TD error before and after the update.
            td_error_pre = qCurrActionSelect - targets
            # print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))
            qCurr = np.reshape(getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                               q_resize_from_network)
            qCurrActionSelect_post = np.zeros((batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :, i]
            td_error_post = qCurrActionSelect_post - targets
            # print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # Bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) +
                  ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
                  ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
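# ---------------------------------------------------------------------------
# The unrolled assignments above implement cascade pruning: level i+1 adopts
# the raw TD target only where that target is below the level-i value of the
# chosen action. A small generic sketch with the same semantics (hypothetical
# helper, works for any number of cascade levels):
import numpy as np

def cascade_targets(targets_raw, q_action, num_cascade):
    # targets_raw: (batch, patches); q_action: (batch, patches, num_cascade)
    out = np.zeros_like(q_action)
    out[..., 0] = targets_raw
    for i in range(num_cascade - 1):
        mask = targets_raw < q_action[..., i]
        out[..., i + 1] = mask * targets_raw + (1 - mask) * q_action[..., i + 1]
    return out
# ---------------------------------------------------------------------------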
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps, vispolicy): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Create environment and set stride parameters for this problem instance. # Most of the time, these two stride parameters will be equal. However, # one might use a smaller stride for initial placement and a larger stride # for action specification in order to speed things up. Unfortunately, this # could cause the problem to be infeasible: no grasp might work for a given # initial setup. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification # Standard q-learning parameters reuseModels = None max_timesteps = inputmaxtimesteps exploration_fraction = 0.5 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 60 buffer_size = 1000 # batch_size=32 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # useHierarchy = False useHierarchy = True # Set parameters related to shape of the patch and the number of patches descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (15,15,2) descriptorShapeSmall = (20, 20, 2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2 * num_patches * env.num_orientations # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set parameters for prioritized replay. You can turn this off just by # setting the line below to False # prioritized_replay=True prioritized_replay = False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_rot", reuse=reuseModels) getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_rot", reuse=reuseModels) targetTrainNotHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, 
make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_rot", grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_rot", grad_norm_clipping=1., reuse=reuseModels) getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_norot", reuse=reuseModels) getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_norot", reuse=reuseModels) targetTrainNotHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_norot", grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_norot", grad_norm_clipping=1., reuse=reuseModels) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([ 2, ]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. 
if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Use hierarchy to get candidate actions if useHierarchy: # Get NoRot descriptors moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptorsNoRot = np.stack([ moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot)) ], axis=3) actionsPlaceDescriptorsNoRot = np.stack([ np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot ], axis=3) actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot, actionsPlaceDescriptorsNoRot] # Get NoRot values if obs[1] == 0: qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot) qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot) elif obs[1] == 1: qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot) qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot) else: print("error: state out of bounds") qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace]) # Get Rot actions corresponding to top k% NoRot actions k = 0.2 # top k% of NoRot actions valsNoRot = qCurrNoRot topKactionsNoRot = np.argsort( valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):] topKpositionsNoRot = topKactionsNoRot % env.num_moves topKpickplaceNoRot = topKactionsNoRot / env.num_moves actionsCandidates = [] for ii in range(2): eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii] for jj in range(env.num_orientations): actionsCandidates = np.r_[ actionsCandidates, eltsPos + jj * env.num_moves + ii * (env.num_moves * env.num_orientations)] actionsCandidates = np.int32(actionsCandidates) # No hierarchy else: actionsCandidates = range(2 * env.num_moves * env.num_orientations) # Get Rot descriptors moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]]) moveDescriptorsRot = moveDescriptorsRot * 2 - 1 actionsPickDescriptorsRot = np.stack( [moveDescriptorsRot, np.zeros(np.shape(moveDescriptorsRot))], axis=3) actionsPlaceDescriptorsRot = np.stack( [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot], axis=3) actionDescriptorsRot = np.r_[actionsPickDescriptorsRot, actionsPlaceDescriptorsRot] # Get qCurr using actionCandidates actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates] if obs[1] == 0: qCurrReduced = np.squeeze( getqNotHoldingRot(actionDescriptorsRotReduced)) elif obs[1] == 1: qCurrReduced = np.squeeze( getqHoldingRot(actionDescriptorsRotReduced)) else: print("error: state out of bounds") qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0]) qCurr[actionsCandidates] = np.copy(qCurrReduced) # # Get qCurr. 
I split up pick and place in order to accomodate larger batches # if obs[1] == 0: # qCurrPick = getqNotHoldingRot(actionsPickDescriptorsRot) # qCurrPlace = getqNotHoldingRot(actionsPlaceDescriptorsRot) # elif obs[1] == 1: # qCurrPick = getqHoldingRot(actionsPickDescriptorsRot) # qCurrPlace = getqHoldingRot(actionsPlaceDescriptorsRot) # else: # print("error: state out of bounds") # qCurr = np.squeeze(np.r_[qCurrPick,qCurrPlace]) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr) V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues # # Select e-greedy action to execute # qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # action = np.argmax(qCurrNoise) # if (np.random.rand() < exploration.value(t)) and not vispolicy: # action = np.random.randint(num_actions) # e-greedy + softmax # qCurrExp = np.exp(qCurr/0.3) qCurrExp = np.exp(qCurr / 0.2) # qCurrExp = np.exp(qCurr/0.1) probs = qCurrExp / np.sum(qCurrExp) action = np.random.choice(range(np.size(probs)), p=probs) if (np.random.rand() < exploration.value(t)) and not vispolicy: action = np.random.randint(num_actions) position = action % env.num_moves pickplace = action / (env.num_moves * env.num_orientations) # orientation = action / env.num_moves orientation = (action - pickplace * env.num_moves * env.num_orientations) / env.num_moves actionNoRot = position + pickplace * env.num_moves if vispolicy: print("action: " + str(action)) print("position: " + str(position)) print("pickplace: " + str(pickplace)) print("orientation: " + str(orientation)) vposition = env.moveCenters[position / len(env.moveCenters)] hposition = env.moveCenters[position % len(env.moveCenters)] plt.subplot(1, 2, 1) im = env.state[0][:, :, 0] im[vposition, hposition] = 0.5 plt.imshow(env.state[0][:, :, 0]) # plt.show() # Execute action new_obs, rew, done, _ = env.step(action) if useHierarchy: # store both NoRot and Rot descriptors replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsNoRot[actionNoRot, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) else: # store only Rot descriptor replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsRot[action, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) if vispolicy: print("rew: " + str(rew)) print("done: " + str(done)) plt.subplot(1, 2, 2) plt.imshow(env.state[0][:, :, 0]) plt.show() if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1 - dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot) qCurrTargetHolding = getqHoldingRot(actionPatchesRot) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets # Train targetTrainNotHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingRot( 
actionPatchesRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Only train NoRot if we're doing the hierarchy if useHierarchy: # qCurrTargetNotHoldingNoRot = getqNotHoldingNoRot(actionPatchesNoRot) # qCurrTargetHoldingNoRot = getqHoldingNoRot(actionPatchesNoRot) # qCurrTargetNoRot = np.concatenate([qCurrTargetNotHoldingNoRot,qCurrTargetHoldingNoRot],axis=1) # idx = np.nonzero(np.int32(qCurrTargetNoRot[range(batch_size),states_t] > targets)) # targets[idx] = qCurrTargetNoRot[idx,states_t[idx]] targetTrainNotHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % exploration factor: " + str(int(100*explorationGaussianFactor.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV, V) # display value function obs = env.reset() moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors) qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors) qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot], axis=1) qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors) qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors) qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot], axis=1) moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors) qPickHolding = getqHoldingRot(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors) qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) gridSize = len(env.moveCenters) print("Value function for 
pick action in hold-0 state:") print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot0 in hold-0 state:") print(str(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot1 in hold-0 state:") print( str( np.reshape(qPick[gridSize**2:2 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot2 in hold-0 state:") print( str( np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for pick action for rot3 in hold-0 state:") print( str( np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot0 in hold-1 state:") print(str(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot1 in hold-1 state:") print( str( np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot2 in hold-1 state:") print( str( np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1], [gridSize, gridSize]))) print("Value function for place action for rot3 in hold-1 state:") print( str( np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1], [gridSize, gridSize]))) plt.subplot(2, 10, 1) plt.imshow(np.tile(env.state[0], [1, 1, 3]), interpolation=None) plt.subplot(2, 10, 2) plt.imshow(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 3) plt.imshow(np.reshape(qPick[gridSize**2:2 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 4) plt.imshow(np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 5) plt.imshow(np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 6) plt.imshow(np.reshape(qPick[4 * gridSize**2:5 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 7) plt.imshow(np.reshape(qPick[5 * gridSize**2:6 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 8) plt.imshow(np.reshape(qPick[6 * gridSize**2:7 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 9) plt.imshow(np.reshape(qPick[7 * gridSize**2:8 * gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 10) plt.imshow(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 12) plt.imshow(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 13) plt.imshow(np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 14) plt.imshow(np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 15) plt.imshow(np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 16) plt.imshow(np.reshape(qPlace[4 * gridSize**2:5 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 17) plt.imshow(np.reshape(qPlace[5 * gridSize**2:6 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 18) plt.imshow(np.reshape(qPlace[6 * gridSize**2:7 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 19) 
plt.imshow(np.reshape(qPlace[7 * gridSize**2:8 * gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.subplot(2, 10, 20) plt.imshow(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]), vmin=5, vmax=12) plt.show()
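# [Editor's sketch, not from the original code] The display code above slices the flat
# per-patch Q vector orientation-by-orientation before reshaping each slice into a
# gridSize x gridSize value map. The toy below uses made-up shapes (gridSize=4,
# num_orientations=4, two discrete states) purely to illustrate that indexing
# convention: rows [ii*gridSize**2, (ii+1)*gridSize**2) belong to orientation ii.
import numpy as np

gridSize = 4
num_orientations = 4
qPickToy = np.arange(num_orientations * gridSize**2 * 2,
                     dtype=np.float32).reshape(-1, 2)
for ii in range(num_orientations):
    block = qPickToy[ii * gridSize**2:(ii + 1) * gridSize**2, 0]
    valueMap = np.reshape(block, [gridSize, gridSize])
    print("rot" + str(ii) + " value map shape: " + str(valueMap.shape))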
def main(): env = envstandalone.MultiGhostEvade() # env = envstandalone.GhostEvade() # env = envstandalone.BallCatch() # max_timesteps=40000 max_timesteps=80000 learning_starts=1000 buffer_size=50000 # exploration_fraction=0.2 exploration_fraction=0.4 exploration_final_eps=0.02 print_freq=10 gamma=.98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq=1 learning_alpha = 0.2 batch_size=32 # batch_size=64 # batch_size=1024 train_freq=1 # obsShape = (8,8,1) obsShape = env.observation_space.shape # deicticShape = (3,3,2) # deicticShape = (3,3,4) # deicticShape = (4,4,2) # deicticShape = (4,4,4) deicticShape = (5,5,2) # deicticShape = (6,6,2) # deicticShape = (8,8,2) # num_deictic_patches = 36 # num_deictic_patches = 25 num_deictic_patches = 16 # num_deictic_patches = 9 # num_deictic_patches = 1 num_cascade = 5 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu=16 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Dictionary-based value function q_func = {} def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def getTabularKeys(obsDeictic): obsDeicticTiled = np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) obsBits = np.packbits(obsDeicticTiled,1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the type cast below (UINT64) must be large enough to support the size of obsBits # if it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i]) return obsKeys def getTabular(obsDeictic): keys = getTabularKeys(obsDeictic) return np.array([q_func[x] if x in q_func else 1000*np.ones([num_cascade,num_actions]) for x in keys]) def trainTabular(obsDeictic,qCurrTargets): keys = getTabularKeys(obsDeictic) alpha=0.5 for i in range(len(keys)): if keys[i] in q_func: q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func[keys[i]] = qCurrTargets[i] sess = U.make_session(num_cpu) sess.__enter__() getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. U.initialize() obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # Get current obervations obsDeictic = getDeic([obs]) qCurr = getTabular(obsDeictic) # select action qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) # Get next obervations obsNextDeictic = getDeic([new_obs]) qNext = getTabular(obsNextDeictic) # Calculate TD target qNextmax = np.max(qNext[:,-1,:],1) # USE CASCADE targets = rew + (1-done) * gamma * qNextmax # Update dictionary value function qCurrTargets = np.copy(qCurr) # Copy into cascade with pruning. 
qCurrTargets[:,0,action] = targets for i in range(num_cascade-1): mask = targets < qCurr[:,i,action] qCurrTargets[:,i+1,action] = \ mask*targets + \ (1-mask)*qCurr[:,i+1,action] # qCurrTargets[:,action] = np.minimum(targets,qCurrTargets[:,action]) trainTabular(obsDeictic,qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
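# [Editor's sketch, not from the original code] A self-contained illustration of the
# cascade-with-pruning update used above: level 0 always absorbs the TD target, while
# level i+1 only accepts a target that is *below* level i's current estimate, so deeper
# cascade levels hold increasingly conservative values. All shapes and numbers here are
# hypothetical.
import numpy as np

num_patches, num_cascade, num_actions = 3, 5, 4
qCurr = np.full([num_patches, num_cascade, num_actions], 10.0)
targets = np.array([12.0, 8.0, 10.0])  # one TD target per patch
action = 2

qCurrTargets = np.copy(qCurr)
qCurrTargets[:, 0, action] = targets
for i in range(num_cascade - 1):
    mask = targets < qCurr[:, i, action]  # True where the target undercuts level i
    qCurrTargets[:, i + 1, action] = mask * targets + (1 - mask) * qCurr[:, i + 1, action]

print(qCurrTargets[:, :, action])
# patch 0 (target 12 > 10): only level 0 is raised; deeper levels keep 10
# patch 1 (target 8 < 10): the lower target propagates down the whole cascade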
def learn(self, total_timesteps=None, total_episodes=None, log_interval=100, ckpt_interval=100, ckpt_path=None): def _sample_episode(): sample = [] obs = self.env.reset() done = False while not done: update_eps = self.exploration.value(self.ep_done) if np.random.random_sample() > update_eps: action, value = self.policy.predict(obs, deterministic=True) else: action, value = self.policy.predict(obs, deterministic=False) new_obs, reward, done, info = self.env.step(action) sample.append((obs, action, reward)) obs = new_obs return sample last_100rewards = np.zeros(100) last_100rewards[:] = np.NaN episode_rewards = [] episode_successes = [] loop_var = total_timesteps if total_timesteps is not None else total_episodes loop_type = 'episode' if total_episodes else 'timesteps' if total_timesteps is not None: raise ValueError( 'Only total_episodes can be specified for this class') # if self.exploration_frac is None: # self.exploration = LinearSchedule(frac=self.exploration_ep, # initial=self.exploration_initial_eps, # final=self.exploration_final_eps) # else: # self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var, # initial=self.exploration_initial_eps, # final=self.exploration_final_eps) if self.exploration_type == 'linear': self.exploration = LinearSchedule( frac=self.exploration_frac * loop_var, initial=self.exploration_initial_eps, final=self.exploration_final_eps) elif self.exploration_type == 'exponential': self.exploration = ExponentialSchedule( frac=self.exploration_frac, initial=self.exploration_initial_eps, final=self.exploration_final_eps) train = True while train: sample = _sample_episode() obses, actions, rewards = zip(*sample) for idx in range(len(sample)): self.elapsed_steps += 1 discounts = np.array( [self.gamma**i for i in range(len(obses) + 1)]) expected_reward = sum( rewards[idx:] * discounts[:-(1 + idx)]) - self.qvalues[obses[idx] + (actions[idx], )] self.qvalues[obses[idx] + ( actions[idx], )] += self.learning_rate * expected_reward # print(np.where(self.qvalues!=0)) if self.policy.intent: h_update = np.zeros(self.qvalues.shape) intent_update = np.zeros(len(BLACKJACK_OUTCOMES)) for iidx, (obs, action, reward) in enumerate( zip(obses[idx:], actions[idx:], rewards[idx:])): h_update[obs + ( action, )] += self.learning_rate * discounts[iidx] outcome = BLACKJACK_OUTCOMES[int(action), int(reward)] intent_update[ outcome] += self.learning_rate * discounts[iidx] mc_h = self.hvalues[obses[idx] + (actions[idx], )] * ( 1 - self.learning_rate) mc_h += h_update # print(obses[idx], actions[idx]) mc_intent = self.intention[obses[idx] + (actions[idx], )] * ( 1 - self.learning_rate) mc_intent += intent_update self.hvalues[obses[idx] + (actions[idx], )] = mc_h self.intention[obses[idx] + (actions[idx], )] = mc_intent self.ep_done += 1 last_100rewards[self.ep_done % 100] = np.sum(rewards) print("\rEpisode {}/{}, Average Reward {}".format(self.ep_done, total_episodes, np.nanmean(last_100rewards)), end="") # print(len(sample)) if self.ep_done >= total_episodes: train = False if ckpt_path is not None and ckpt_interval: if loop_type == 'episode': if self.ep_done % ckpt_interval == 0: ckpt_str = str(self.ep_done) full_path = ckpt_path + '/' + ckpt_str super(BlackjackMCTabularRLModel, self).save(full_path) if loop_type == 'timesteps': if self.elapsed_steps % ckpt_interval == 0: ckpt_str = str(self.ep_done) full_path = ckpt_path + '/' + ckpt_str super(BlackjackMCTabularRLModel, self).save(full_path)
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # Dictionary-based value function q_func_tabular = {} defaultQValue = np.ones(env.action_space.n) # Given an integer, return the corresponding boolean array def getBoolBits(state): return np.unpackbits(np.uint8(state), axis=1) == 1 # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else defaultQValue for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets): keys = getTabularKeys(vectorKey) alpha = 0.1 for i in range(len(keys)): if keys[i] in q_func_tabular: q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[ keys[i]] + alpha * qCurrTargets[i] # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] max_timesteps = 200000 exploration_fraction = 0.3 exploration_final_eps = 0.02 print_freq = 1 gamma = .98 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 100 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 valueFunctionType = "TABULAR" # valueFunctionType = "DQN" episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Set up replay buffer prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) sess = U.make_session(num_cpu) sess.__enter__() state = env.reset() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1) qCurr = getTabular(getBoolBits([[state]])) qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # select action at random action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action nextState, rew, done, _ = env.step(action) replay_buffer.add(state, action, rew, nextState, float(done)) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: beta = beta_schedule.value(t) states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actions, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None qNext = getTabular( getBoolBits(np.reshape(states_tp1, [batch_size, 1]))) qNextmax = np.max(qNext, axis=1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrTarget = getTabular( getBoolBits(np.reshape(states_t, [batch_size, 1]))) td_error = qCurrTarget[range(batch_size), actions] - targets qCurrTarget[range(batch_size), actions] = targets trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])), qCurrTarget) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal state = np.copy(nextState)
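# [Editor's sketch, not from the original code] How getTabularKeys hashes a boolean
# feature vector into a single dictionary key: np.packbits collapses the bits into
# bytes, and the bytes become base-256 "digits" of a np.uint64. As the comments above
# warn, the cast must be wide enough to hold all the bytes, or distinct observations
# silently collide. Input values are made up.
import numpy as np

vectorKey = np.array([[True, False, True, True, False, False, True, False,
                       True, True]])     # one row of 10 boolean features
obsBits = np.packbits(vectorKey, 1)      # two bytes per row
obsKeys = 0
for i in range(np.shape(obsBits)[1]):
    obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
print(obsBits)  # [[178 192]]
print(obsKeys)  # 178 + 192*256 = 49330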
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else 10 * np.ones(num_states) for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets, weights): keys = getTabularKeys(vectorKey) alpha = 0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: # q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i] q_func_tabular[ keys[i]] = q_func_tabular[keys[i]] + alpha * weights[i] * ( qCurrTargets[i] - q_func_tabular[keys[i]] ) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] # Return a list of actions in adjacent patches to <action> def getAdjacentActions(action): side = len(env.moveCenters) mat = np.reshape(range(side**2), [side, side]) move = action if action >= side**2: move = action - side**2 coords = np.squeeze(np.nonzero(mat == move)) adjacent = [] # this cell adjacent.append(coords) # 8-neighborhood adjacent.append(coords - [0, 1]) adjacent.append(coords + [0, 1]) adjacent.append(coords - [1, 0]) adjacent.append(coords + [1, 0]) adjacent.append(coords + [-1, -1]) adjacent.append(coords + [1, -1]) adjacent.append(coords + [-1, 1]) adjacent.append(coords + [1, 1]) # 16-neighborhood adjacent.append(coords + [-2, 2]) adjacent.append(coords + [-1, 2]) adjacent.append(coords + [0, 2]) adjacent.append(coords + [1, 2]) adjacent.append(coords + [2, 2]) adjacent.append(coords + [2, 1]) adjacent.append(coords + [2, 0]) adjacent.append(coords + [2, -1]) adjacent.append(coords + [2, -2]) adjacent.append(coords + [1, -2]) adjacent.append(coords + [0, -2]) adjacent.append(coords + [-1, -2]) adjacent.append(coords + [-2, -2]) adjacent.append(coords + [-2, -1]) adjacent.append(coords + [-2, 0]) adjacent.append(coords + [-2, 1]) adjacentValid = [x for x in adjacent if all(x < side) and all(x >= 0)] if action >= side**2: return [side**2 + x[0] * side + x[1] for x in adjacentValid] else: return [x[0] * side + x[1] for x in adjacentValid] env = envstandalone.NumbersArrange() # Standard q-learning parameters max_timesteps = 2000 exploration_fraction = 0.3 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 1000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # first two elts of deicticShape must be odd descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2) # descriptorShapeSmall = (10,10,2) # descriptorShapeSmall = (14,14,2) descriptorShapeSmall = (20, 20, 2) num_states = 2 # either holding or not num_patches = len(env.moveCenters)**2 num_actions = 2 * num_patches num_actions_discrete = 2 num_patches_side = len(env.moveCenters) # valueFunctionType = "TABULAR" valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action 
descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay = False # prioritized_replay_alpha=1.0 prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None # prioritized_replay_beta_iters=20000 prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(16, 3, 1)], hiddens=[32], # convs=[(32,3,1)], # hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) if valueFunctionType == 'DQN': getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding") getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding") targetTrainNotHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding", grad_norm_clipping=1.) targetTrainHolding = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding", grad_norm_clipping=1.) 
getqNotHoldingCoarse = build_getq( make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_coarse") getqHoldingCoarse = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_states=num_states, num_cascade=5, scope="deepq", qscope="q_func_holding_coarse") targetTrainNotHoldingCoarse = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, # optimizer=tf.train.AdamOptimizer(learning_rate=lr*20), optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_coarse", grad_norm_clipping=None) targetTrainHoldingCoarse = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_states=num_states, num_cascade=5, # optimizer=tf.train.AdamOptimizer(learning_rate=lr*20), optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_coarse", grad_norm_clipping=None) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] newEpisode = 0 td_errors = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1) # select action at random qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:, obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx, obs[1]]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv == actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") adjacentActions = getAdjacentActions(action) # take action new_obs, rew, done, _ = env.step(action) # replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done)) replay_buffer.add(obs[1], actionDescriptors[action, :], actionDescriptors[adjacentActions, :], np.copy(rew), np.copy(new_obs), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(images_tp1) moveDescriptorsNext = moveDescriptorsNext * 2 - 1 actionsPickDescriptorsNext = np.stack( [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3) actionsPlaceDescriptorsNext = np.stack( [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3) actionDescriptorsNext = np.stack( [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1 ) # I sometimes get this axis parameter wrong... pay attention! # flat estimate of qNextmax actionDescriptorsNext = np.reshape(actionDescriptorsNext, [ batch_size * num_patches * num_actions_discrete, descriptorShapeSmall[0], descriptorShapeSmall[1], descriptorShapeSmall[2] ]) qNextNotHolding = getqNotHolding(actionDescriptorsNext) qNextHolding = getqHolding(actionDescriptorsNext) qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1) qNext = np.reshape( qNextFlat, [batch_size, num_patches, num_actions_discrete, num_states]) qNextmax = np.max( np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1) # # coarse/fine estimate of qNextmax # actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size,num_patches_side,num_patches_side,num_actions_discrete,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]]) # aa = actionDescriptorsNext[:,range(0,num_patches_side,2),:,:,:,:,:] # bb = aa[:,:,range(0,num_patches_side,2),:,:,:,:] # cc = np.reshape(bb,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]]) # qNextNotHolding = getqNotHoldingCoarse(cc) # qNextHolding = getqHoldingCoarse(cc) # qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1) # qNext = np.reshape(qNextFlat,[batch_size,-1,num_actions_discrete,num_states]) # qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1) targets = rewards + (1 - dones) * gamma * qNextmax # train action Patches qCurrTargetNotHolding = getqNotHolding(actionPatches) qCurrTargetHolding = getqHolding(actionPatches) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets targetTrainNotHolding( actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) td_errors[-1] += td_error # bookkeeping for storing episode rewards episode_rewards[-1] += rew newEpisode = 0 if done: newEpisode = 1 new_obs = env.reset() episode_rewards.append(0.0) td_errors.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % 
time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) timerStart = timerFinal obs = np.copy(new_obs) # Train coarse grid if newEpisode: moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] # actionDescriptors, inverseIdx = np.unique(actionDescriptors,axis=0,return_inverse=True) # reduce to just unique descriptors qCurrNotHolding = getqNotHolding(actionDescriptors) qCurrHolding = getqHolding(actionDescriptors) qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding)) qTargetHolding = np.zeros(np.shape(qCurrHolding)) for jj in range(num_actions): adj = getAdjacentActions(jj) qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj]) qTargetHolding[jj] = np.max(qCurrHolding[adj]) for iter in range(10): targetTrainNotHoldingCoarse( actionDescriptors, np.reshape(qTargetNotHolding, [-1, 1]), np.ones([num_actions, 1])) targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding, [-1, 1]), np.ones([num_actions, 1])) # # Train coarse grid # for iter in range(500): # print(str(iter)) # obs = env.reset() # moveDescriptors = getMoveActionDescriptors([obs[0]]) # moveDescriptors = moveDescriptors*2-1 # actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) # actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) # actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors] # qCurrNotHolding = getqNotHolding(actionDescriptors) # qCurrHolding = getqHolding(actionDescriptors) # qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding)) # qTargetHolding = np.zeros(np.shape(qCurrHolding)) # for jj in range(num_actions): # adj = getAdjacentActions(jj) # qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj]) # qTargetHolding[jj] = np.max(qCurrHolding[adj]) # targetTrainNotHoldingCoarse(actionDescriptors, np.reshape(qTargetNotHolding,[-1,1]), np.ones([num_actions,1])) # targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding,[-1,1]), np.ones([num_actions,1])) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0])) actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) print(str(obs[0][:, :, 0])) qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:, 0], [gridSize, gridSize]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:, 1], [gridSize, gridSize]))) qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:, 0], [gridSize, gridSize]))) print("Value 
function for place action in hold-1 state:") print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize]))) qPickNotHolding = getqNotHoldingCoarse(actionsPickDescriptors) qPickHolding = getqHoldingCoarse(actionsPickDescriptors) qPickCoarse = np.concatenate([qPickNotHolding, qPickHolding], axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPickCoarse[:, 1], [gridSize, gridSize]))) qPlaceNotHolding = getqNotHoldingCoarse(actionsPlaceDescriptors) qPlaceHolding = getqHoldingCoarse(actionsPlaceDescriptors) qPlaceCoarse = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlaceCoarse[:, 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize]))) plt.subplot(2, 3, 1) plt.imshow(np.tile(env.state[0], [1, 1, 3])) plt.subplot(2, 3, 2) plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize])) plt.subplot(2, 3, 3) plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize])) plt.subplot(2, 3, 5) plt.imshow(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize])) plt.subplot(2, 3, 6) plt.imshow(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize])) plt.show()
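# [Editor's sketch, not from the original code] The coarse-grid training above
# regresses each coarse Q value onto the max of the fine Q values over that action's
# neighborhood, giving an upper envelope that can later prune fine evaluations. Below,
# a simplified 4-neighborhood stand-in for getAdjacentActions, with hypothetical sizes,
# shows the target construction.
import numpy as np

side = 4
num_actions = side**2
qFine = np.random.rand(num_actions)

def getAdjacentSimple(action):
    # 4-neighborhood on a side x side grid (the code above uses a larger
    # 8/16-neighborhood and a separate pick/place offset)
    r, c = divmod(action, side)
    cells = [(r, c), (r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
    return [rr * side + cc for rr, cc in cells if 0 <= rr < side and 0 <= cc < side]

qCoarseTarget = np.array([np.max(qFine[getAdjacentSimple(a)])
                          for a in range(num_actions)])
assert np.all(qCoarseTarget >= qFine)  # the coarse target dominates the fine value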
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps, vispolicy, objType, numOrientations, useRotHierarchy, useHandCodeHierarchy): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Create environment and set two stride parameters (stride-x and stride-y) # for this problem instance. Most of the time, the two stride parameters will be equal. env = envstandalone.PuckArrange() env.initStride = initEnvStride # stride for initial puck placement env.stride = envStride # stride for action specification env.blockType = objType env.num_orientations = numOrientations env.reset() # Standard q-learning parameters reuseModels = None max_timesteps = inputmaxtimesteps exploration_fraction = 0.75 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 60 buffer_size = 10000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 # SGD learning rate lr = 0.0003 # Set parameters related to shape of the patch and the number of patches descriptorShape = ( env.blockSize * 3, env.blockSize * 3, 2 ) # size of patch descriptor relative to number of "blocks" on board (each block is a 28x28 region) descriptorShapeSmall = ( 25, 25, 2 ) # size to which each patch gets resized to. Code runs faster w/ smaller sizes, but could miss detail needed to solve the problem. num_discrete_states = 2 # number of discrete states: either holding or not num_patches = len( env.moveCenters )**2 # env.moveCenters is num of patches along one side of image num_actions = num_discrete_states * num_patches * env.num_orientations # total actions = num discrete states X num non-rotated descriptor patches X num of orientations per patch location # e-greedy exploration schedule. I find that starting at e=50% helps curriculum learning "remember" what was learned in the prior run. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=0.5, final_p=exploration_final_eps) # Set parameters for prioritized replay. 
You can turn this off just by # setting the line below to False prioritized_replay = True # prioritized_replay=False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 # Create neural network q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)], hiddens=[64], dueling=True) # Build tensorflow functions def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_actionDeic_ph(name): return U.BatchInput(descriptorShapeSmall, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride) getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot( make_obs_ph=make_obs_ph, actionShape=descriptorShape, actionShapeSmall=descriptorShapeSmall, stride=env.stride, numOrientations=numOrientations) getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_rot", reuse=reuseModels) getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_holding_rot", reuse=reuseModels) targetTrainNotHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer( learning_rate=lr / 2.), # rotation learns slower than norot # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot scope="deepq", qscope="q_func_notholding_rot", # grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer( learning_rate=lr / 2.), # rotation learns slower than norot # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot scope="deepq", qscope="q_func_holding_rot", # grad_norm_clipping=1., reuse=reuseModels) getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_notholding_norot", reuse=reuseModels) getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, scope="deepq", qscope="q_func_holding_norot", reuse=reuseModels) targetTrainNotHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, 
optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_notholding_norot", # grad_norm_clipping=1., reuse=reuseModels) targetTrainHoldingNoRot = build_targetTrain( make_actionDeic_ph=make_actionDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, num_discrete_states=num_discrete_states, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_holding_norot", # grad_norm_clipping=1., reuse=reuseModels) # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy. lrState = 0.1 V = np.zeros([ 2, ]) # Start tensorflow session sess = U.make_session(num_cpu) sess.__enter__() # Initialize things obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() # Load neural network model if one was specified. if fileIn != "None": saver = tf.train.Saver() saver.restore(sess, fileIn) fileInV = fileIn + 'V.npy' V = np.load(fileInV) # Iterate over time steps for t in range(max_timesteps): # Get NoRot descriptors. Each x-y position gets one descriptor patch in # a single orientation. Encode pick/place using a stack of two image channels. # Pick actions are denoted by the patch in channel 0 and zeros in channel 1. # Place actions have zeros in channel 0 and the patch in channel 1. # Each elt of actionDescriptorsNoRot is a pick/place action to a specific # position with orientation left unspecified. moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptorsNoRot = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptorsNoRot = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot, actionsPlaceDescriptorsNoRot] # If useHandCodeHierarchy == 1, we exclude patches that are completely zero if useHandCodeHierarchy == 1: nonZeroMoves = np.sum(np.sum(moveDescriptorsNoRot > 0, -1), -1) > 0 movesCandidates = np.nonzero(nonZeroMoves)[0] actionsCandidates = [] for jj in range(0, num_discrete_states): for ii in range(0, env.num_orientations): actionsCandidates = np.r_[actionsCandidates, movesCandidates + ii * env.num_moves + jj * env.num_orientations * env.num_moves] actionsCandidatesHandCodeHierarchy = np.int32(actionsCandidates) movesCandidatesHandCodeHierarchy = np.int32(movesCandidates) else: actionsCandidatesHandCodeHierarchy = range( num_discrete_states * env.num_moves * env.num_orientations) movesCandidatesHandCodeHierarchy = range(env.num_moves) # If useRotHierarchy == 1, we evaluate the Q function using a two-level hierarchy. # The first level (getq<Not>HoldingNoRot) is position but no rotation. # The second level (getq<Not>HoldingRot) is both position and orientation. # Specifically, we evaluate getq<Not>HoldingRot only for the top 20% of positions # found using getq<Not>HoldingNoRot. 
if useRotHierarchy == 1: # Get NoRot values if obs[1] == 0: qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) elif obs[1] == 1: qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot[ movesCandidatesHandCodeHierarchy]) else: print("error: state out of bounds") qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace]) qCurrNoRotIdx = np.r_[movesCandidatesHandCodeHierarchy, env.num_moves + movesCandidatesHandCodeHierarchy] # Get Rot actions corresponding to top k% NoRot actions k = 0.2 # top k% of NoRot actions # k=0.1 # DEBUG: TRYING TO VISUALIZE AND RAN OUT OF MEM ON LAPTOP... valsNoRot = qCurrNoRot topKactionsNoRot = np.argsort( valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):] topKpositionsNoRot = qCurrNoRotIdx[topKactionsNoRot] % env.num_moves topKpickplaceNoRot = qCurrNoRotIdx[topKactionsNoRot] // env.num_moves # integer division: 0 = pick, 1 = place actionsCandidates = [] for ii in range(2): eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii] for jj in range(env.num_orientations): actionsCandidates = np.r_[ actionsCandidates, eltsPos + jj * env.num_moves + ii * (env.num_moves * env.num_orientations)] actionsCandidatesRotHierarchy = np.int32(actionsCandidates) # No rot hierarchy else: actionsCandidatesRotHierarchy = range( num_discrete_states * env.num_moves * env.num_orientations) # Intersect two types of hierarchy and get final list of actions to consider actionsCandidates = np.intersect1d(actionsCandidatesRotHierarchy, actionsCandidatesHandCodeHierarchy) # Get all patch descriptors (position + rotation) moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]]) moveDescriptorsRot = moveDescriptorsRot * 2 - 1 actionsPickDescriptorsRot = np.stack( [moveDescriptorsRot, np.zeros(np.shape(moveDescriptorsRot))], axis=3) actionsPlaceDescriptorsRot = np.stack( [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot], axis=3) actionDescriptorsRot = np.r_[actionsPickDescriptorsRot, actionsPlaceDescriptorsRot] # Get qCurr for selected actions, i.e. 
actions contained in actionsCandidates actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates] if obs[1] == 0: qCurrReduced = np.squeeze( getqNotHoldingRot(actionDescriptorsRotReduced)) elif obs[1] == 1: qCurrReduced = np.squeeze( getqHoldingRot(actionDescriptorsRotReduced)) else: print("error: state out of bounds") qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0]) qCurr[actionsCandidates] = np.copy(qCurrReduced) # Update tabular state-value function using V(s) = max_a Q(s,a) thisStateValues = np.max(qCurr) V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues # # Select e-greedy action to execute # qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly # action = np.argmax(qCurrNoise) # if (np.random.rand() < exploration.value(t)) and not vispolicy: # action = np.random.randint(num_actions) # e-greedy + softmax action selection qCurrExp = np.exp(qCurr / 0.1) probs = qCurrExp / np.sum(qCurrExp) action = np.random.choice(range(np.size(probs)), p=probs) if (np.random.rand() < exploration.value(t)) and not vispolicy: action = np.random.randint(num_actions) # factor action into position, orientation, pick-or-place (integer division keeps these valid indices) position = action % env.num_moves pickplace = action // (env.num_moves * env.num_orientations) orientation = (action - pickplace * env.num_moves * env.num_orientations) // env.num_moves actionNoRot = position + pickplace * env.num_moves if vispolicy: print("action: " + str(action)) print("position: " + str(position)) print("pickplace: " + str(pickplace)) print("orientation: " + str(orientation)) plt.subplot(1, 2, 1) plt.imshow(env.state[0][:, :, 0]) sp.misc.imsave('temp1.png', env.state[0][:, :, 0]) # Execute action new_obs, rew, done, _ = env.step(action) # Add to buffer replay_buffer.add(cp.copy(obs[1]), np.copy(actionDescriptorsNoRot[actionNoRot, :]), np.copy(actionDescriptorsRot[action, :]), cp.copy(rew), cp.copy(new_obs[1]), cp.copy(float(done))) # If vispolicy==True, then visualize policy if vispolicy: print("rew: " + str(rew)) print("done: " + str(done)) plt.subplot(1, 2, 2) plt.imshow(env.state[0][:, :, 0]) plt.show() sp.misc.imsave('temp2.png', env.state[0][:, :, 0]) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: beta = beta_schedule.value(t) states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample( batch_size, beta) else: states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate target targets = rewards + (1 - dones) * gamma * V[states_tp1] # Get current q-values and calculate td error and q-value targets qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot) qCurrTargetHolding = getqHoldingRot(actionPatchesRot) qCurrTarget = np.concatenate( [qCurrTargetNotHolding, qCurrTargetHolding], axis=1) td_error = qCurrTarget[range(batch_size), states_t] - targets qCurrTarget[range(batch_size), states_t] = targets # Train targetTrainNotHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingRot( actionPatchesRot, np.reshape(qCurrTarget[:, 1], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainNotHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 0], [batch_size, 1]), np.reshape(weights, [batch_size, 1])) targetTrainHoldingNoRot( actionPatchesNoRot, np.reshape(qCurrTarget[:, 1], 
[batch_size, 1]), np.reshape(weights, [batch_size, 1])) # Update replay priorities using td_error if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # save learning curve filename = 'PA18_deictic_rewards.dat' np.savetxt(filename, episode_rewards) # save what we learned if fileOut != "None": saver = tf.train.Saver() saver.save(sess, fileOut) fileOutV = fileOut + 'V' print("fileOutV: " + fileOutV) np.save(fileOutV, V) # Display value function from this run obs = env.reset() moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]]) moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptorsNoRot, np.zeros(np.shape(moveDescriptorsNoRot))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot], axis=3) qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors) qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors) qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot], axis=1) qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors) qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors) qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot], axis=1) moveDescriptors = getMoveActionDescriptorsRot([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors) qPickHolding = getqHoldingRot(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1) qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors) qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1) gridSize = len(env.moveCenters) print("Value function for pick action in hold-0 state:") print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]))) for ii in range(env.num_orientations): print("Value function for pick action for rot" + str(ii) + " in hold-0 state:") print( str( np.reshape( qPick[ii * (gridSize**2):(ii + 1) * (gridSize**2), 0], [gridSize, gridSize]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]))) for ii in range(env.num_orientations): print("Value function for place action for rot" + str(ii) + " in hold-1 state:") print( str( np.reshape( qPlace[ii * (gridSize**2):(ii + 1) * (gridSize**2), 1], [gridSize, gridSize])))
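# [Editor's sketch, not from the original code] The useRotHierarchy branch above
# evaluates the cheap NoRot head at every position, keeps only the top k% of
# positions, and runs the expensive per-orientation Rot head on those survivors,
# leaving every other action at the -100 sentinel. The sketch below reproduces that
# control flow with stand-in random scores and hypothetical sizes.
import numpy as np

num_moves, num_orientations, k = 25, 4, 0.2
qNoRot = np.random.rand(num_moves)                    # cheap pass over positions
topK = np.argsort(qNoRot)[-np.int32(num_moves * k):]  # surviving positions

actionsCandidates = []
for jj in range(num_orientations):
    actionsCandidates = np.r_[actionsCandidates, topK + jj * num_moves]
actionsCandidates = np.int32(actionsCandidates)

qCurr = -100 * np.ones(num_moves * num_orientations)  # sentinel = unevaluated
qCurr[actionsCandidates] = np.random.rand(len(actionsCandidates))  # expensive pass
action = np.argmax(qCurr)
print(str(len(actionsCandidates)) + " of " +
      str(num_moves * num_orientations) + " actions evaluated")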
def main(): # env = envstandalone.BallCatch() env = envstandalone.TestRob3Env() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 # batch_size=1 train_freq = 1 obsShape = (8, 8, 1) deicticShape = (3, 3, 1) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] num_cpu = 16 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :]) return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) return (np.array(deicticObsBatch)) # input: batch x nxnx1 tensor of observations def convertState(observations): shape = np.shape(observations) observations_small = np.squeeze(observations) agent_pos = np.nonzero(observations_small == 10) ghost_pos = np.nonzero(observations_small == 20) state_numeric = 3 * np.ones((4, shape[0])) state_numeric[0, agent_pos[0]] = agent_pos[1] state_numeric[1, agent_pos[0]] = agent_pos[2] state_numeric[2, ghost_pos[0]] = ghost_pos[1] state_numeric[3, ghost_pos[0]] = ghost_pos[2] return np.int32(state_numeric) def convertStateBatch(observations): shape = np.shape(observations) state_numeric_batch = [] for batch in range(shape[0]): state_numeric_batch.append(convertState(observations[batch])) return (np.array(state_numeric_batch)) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( convs=[(16, 3, 1)], # convs=[(16,2,1)], # convs=[(32,3,1)], hiddens=[16], # hiddens=[64], # dueling=True dueling=False) q_func = model # lr=1e-3 lr = 0.001 def make_obs_ph(name): return U.BatchInput(deicticShape, name=name) # return U.BatchInput(obsShape, name=name) def make_target_ph(name): return U.BatchInput([num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq, targetTrain = build_graph.build_train_nodouble( make_obs_ph=make_obs_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), grad_norm_clipping=10, double_q=False) # Initialize the parameters and copy them to the target network. 
U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() # tabularQ = 100*np.ones([deicticShape[0]+1,deicticShape[1]+1,deicticShape[0]+1,deicticShape[1]+1, num_actions]) tabularQ = 0 * np.ones([ deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1, deicticShape[1] + 1, num_actions ]) timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeicticObs(obs) # get q: neural network qCurr = getq(np.array(obsDeictic)) # # get q: tabular # stateCurr = convertState(obsDeictic) # qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise, 0)) selPatch = np.argmax(np.max(qCurrNoise, 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # if t > max_timesteps: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeicticObsBatch(obses_t) obses_tp1_deic = getDeicticObsBatch(obses_tp1) # Reshape everything to (1152,) form obs_resize_to_network = [ batch_size * num_deictic_patches, deicticShape[0], deicticShape[1], deicticShape[2] ] obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network) obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network) donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: neural network version qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: tabular version # q_resize_from_network = [batch_size*num_deictic_patches,num_actions] # stateNext = convertStateBatch(obses_tp1_deic) # qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:] # qNext = np.reshape(qNext,q_resize_from_network) # stateCurr = convertStateBatch(obses_t_deic) # qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:] # qCurr = np.reshape(qCurr,q_resize_from_network) # Get "raw" targets (no masking for cascade levels) qNextmax = np.max(qNext, 1) targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # Update values: neural network version qCurrTargets = np.copy(qCurr) qCurrTargets[range(batch_size * num_deictic_patches), actionsTiled] = targets td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets) # # Update values: tabular version # stateCurrTiled = np.reshape(np.rollaxis(stateCurr,1),[num_actions,batch_size*num_deictic_patches]) # tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] = \ # (1 - learning_alpha) * tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] \ # + learning_alpha * targets # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " 
+ str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
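# [Editor's sketch, not from the original code] The training step above flattens a
# batch of deictic observations into (batch_size * num_deictic_patches) network inputs
# and repeats each transition's reward, done flag, and action once per patch, so every
# patch extracted from an observation shares that transition's TD target. Shapes below
# are hypothetical but match the 8x8 obs / 3x3 window defaults.
import numpy as np

batch_size, obsSide, windowLen = 2, 8, 3
patchesPerSide = obsSide - windowLen + 1
num_deictic_patches = patchesPerSide**2  # 36 for an 8x8 obs and 3x3 window

obses = np.random.rand(batch_size, obsSide, obsSide, 1)
obses_deic = np.array([[obses[b, i:i + windowLen, j:j + windowLen, :]
                        for i in range(patchesPerSide)
                        for j in range(patchesPerSide)]
                       for b in range(batch_size)])
obses_deic = np.reshape(obses_deic,
                        [batch_size * num_deictic_patches, windowLen, windowLen, 1])

rewards = np.array([1.0, 0.0])
rewardsTiled = np.repeat(rewards, num_deictic_patches)
print(obses_deic.shape, rewardsTiled.shape)  # (72, 3, 3, 1) (72,)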
class Agent: def __init__(self, net, actionSet, goalSet, defaultNSample, defaultRandomPlaySteps, controllerMemCap, explorationSteps, trainFreq, hard_update, controllerEpsilon=defaultControllerEpsilon): self.actionSet = actionSet self.controllerEpsilon = controllerEpsilon self.goalSet = goalSet self.nSamples = defaultNSample self.gamma = defaultGamma self.net = net self.memory = PrioritizedReplayBuffer(controllerMemCap, alpha=prioritized_replay_alpha) self.enable_double_dqn = True self.exploration = LinearSchedule(schedule_timesteps = explorationSteps, initial_p = 1.0, final_p = 0.02) self.defaultRandomPlaySteps = defaultRandomPlaySteps self.trainFreq = trainFreq self.randomPlay = True self.learning_done = False self.hard_update = hard_update def selectMove(self, state): if not self.learning_done: if self.controllerEpsilon < random.random(): return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0)) #return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), dummyYtrue, dummyMask], verbose=0)[1]) return random.choice(self.actionSet) else: return np.argmax(self.simple_net.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0)) def setControllerEpsilon(self, epsilonArr): self.controllerEpsilon = epsilonArr def criticize(self, reachGoal, action, die, distanceReward, useSparseReward): reward = 0.0 if reachGoal: reward += 1.0 #reward += 50.0 if die: reward -= 1.0 if not useSparseReward: reward += distanceReward reward = np.minimum(reward, maxReward) reward = np.maximum(reward, minReward) return reward def store(self, experience): self.memory.add(experience.state, experience.action, experience.reward, experience.next_state, experience.done) #self.memory.add(np.abs(experience.reward), experience) def compile(self): def huber_loss(y_true, y_pred, clip_value): assert clip_value > 0. x = y_true - y_pred if np.isinf(clip_value): return .5 * K.square(x) condition = K.abs(x) < clip_value squared_loss = .5 * K.square(x) linear_loss = clip_value * (K.abs(x) - .5 * clip_value) if K.backend() == 'tensorflow': import tensorflow as tf if hasattr(tf, 'select'): return tf.select(condition, squared_loss, linear_loss) # condition, true, false else: return tf.where(condition, squared_loss, linear_loss) # condition, true, false elif K.backend() == 'theano': from theano import tensor as T return T.switch(condition, squared_loss, linear_loss) else: raise RuntimeError('Unknown backend "{}".'.format(K.backend())) def clipped_masked_error(args): y_true, y_pred, mask = args loss = huber_loss(y_true, y_pred, 1) loss *= mask # apply element-wise mask return K.sum(loss, axis=-1) # Create trainable model. The problem is that we need to mask the output since we only # ever want to update the Q values for a certain action. The way we achieve this is by # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 
y_pred = self.net.controllerNet.output y_true = Input(name='y_true', shape=(nb_Action,)) mask = Input(name='mask', shape=(nb_Action,)) loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask]) ins = [self.net.controllerNet.input] if type(self.net.controllerNet.input) is not list else self.net.controllerNet.input trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred]) assert len(trainable_model.output_names) == 2 #combined_metrics = {trainable_model.output_names[1]: metrics} losses = [ lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics ] rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) trainable_model.compile(optimizer=rmsProp, loss=losses) self.trainable_model = trainable_model self.compiled = True def _update(self, stepCount): batches = self.memory.sample(self.nSamples, beta=beta_schedule.value(stepCount)) (stateVector, actionVector, rewardVector, nextStateVector, doneVector, importanceVector, idxVector) = batches stateVector = np.asarray(stateVector) nextStateVector = np.asarray(nextStateVector) q_values = self.net.controllerNet.predict(stateVector) assert q_values.shape == (self.nSamples, nb_Action) if self.enable_double_dqn: actions = np.argmax(q_values, axis = 1) assert actions.shape == (self.nSamples,) target_q_values = self.net.targetControllerNet.predict(nextStateVector) assert target_q_values.shape == (self.nSamples, nb_Action) q_batch = target_q_values[range(self.nSamples), actions] assert q_batch.shape == (self.nSamples,) else: target_q_values = self.net.targetControllerNet.predict(nextStateVector) q_batch = np.max(target_q_values, axis=1) assert q_batch.shape == (self.nSamples,) targets = np.zeros((self.nSamples, nb_Action)) dummy_targets = np.zeros((self.nSamples,)) masks = np.zeros((self.nSamples, nb_Action)) # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target targets accordingly, # but only for the affected output units (as given by action_batch). discounted_reward_batch = self.gamma * q_batch # Set discounted reward to zero for all states that were terminal. terminalBatch = np.array([1-float(done) for done in doneVector]) assert terminalBatch.shape == (self.nSamples,) discounted_reward_batch *= terminalBatch reward_batch = np.array(rewardVector) action_batch = np.array(actionVector) assert discounted_reward_batch.shape == reward_batch.shape Rs = reward_batch + discounted_reward_batch for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)): target[action] = R # update action with estimated accumulated reward dummy_targets[idx] = R mask[action] = 1. # enable loss for this specific action td_errors = targets[range(self.nSamples), action_batch] - q_values[range(self.nSamples), action_batch] new_priorities = np.abs(td_errors) + prioritized_replay_eps self.memory.update_priorities(idxVector, new_priorities) targets = np.array(targets).astype('float32') masks = np.array(masks).astype('float32') # Finally, perform a single update on the entire batch. We use a dummy target since # the actual loss is computed in a Lambda layer that needs more complex input. However, # it is still useful to know the actual target to compute metrics properly. 
ins = [stateVector] if type(self.net.controllerNet.input) is not list else stateVector if stepCount >= self.defaultRandomPlaySteps: loss = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets], sample_weight = [np.array(importanceVector), np.ones(self.nSamples)]) else: loss = [0.0,0.0,0.0] if stepCount > self.defaultRandomPlaySteps and stepCount % self.hard_update == 0: self.net.targetControllerNet.set_weights(self.net.controllerNet.get_weights()) return loss[1], np.mean(q_values), np.mean(np.abs(td_errors)) def update(self, stepCount): loss = self._update(stepCount) return loss def annealControllerEpsilon(self, stepCount, option_learned): if not self.randomPlay: if option_learned: self.controllerEpsilon = 0.0 else: if stepCount > self.defaultRandomPlaySteps: self.controllerEpsilon = self.exploration.value(stepCount - self.defaultRandomPlaySteps) #self.controllerEpsilon[goal] = exploration.value(stepCount - defaultRandomPlaySteps) def clear_memory(self, goal): self.learning_done = True ## Set the done learning flag del self.trainable_model del self.memory gpu = self.net.gpu del self.net gc.collect() rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) with tf.device('/gpu:'+str(gpu)): self.simple_net = Sequential() self.simple_net.add(Conv2D(32, (8,8), strides = 4, activation = 'relu', padding = 'valid', input_shape = (84,84,4))) self.simple_net.add(Conv2D(64, (4,4), strides = 2, activation = 'relu', padding = 'valid')) self.simple_net.add(Conv2D(64, (3,3), strides = 1, activation = 'relu', padding = 'valid')) self.simple_net.add(Flatten()) self.simple_net.add(Dense(HIDDEN_NODES, activation = 'relu', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED))) self.simple_net.add(Dense(nb_Action, activation = 'linear', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED))) self.simple_net.compile(loss = 'mse', optimizer = rmsProp) self.simple_net.load_weights(recordFolder+'/policy_subgoal_' + str(goal) + '.h5') self.simple_net.reset_states()
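# Illustrative sketch (not part of the original Agent class, toy values): the
# masking trick that compile() and _update() rely on, in plain numpy. A one-hot
# mask zeroes out every action except the one actually taken, so only that
# action's Q value contributes to the loss.
def _demo_masked_loss():
    import numpy as np
    nSamples, nb_Action = 2, 3
    q_pred = np.array([[0.5, 1.0, -0.2],
                       [0.1, 0.3, 0.9]])
    actions = np.array([1, 2])
    Rs = np.array([1.5, -0.5])  # Bellman targets for the taken actions

    targets = np.zeros((nSamples, nb_Action))
    masks = np.zeros((nSamples, nb_Action))
    targets[np.arange(nSamples), actions] = Rs  # target only where the action was taken
    masks[np.arange(nSamples), actions] = 1.0   # loss contributes only there

    x = targets - q_pred
    huber = np.where(np.abs(x) < 1.0, 0.5 * np.square(x), np.abs(x) - 0.5)
    return np.sum(huber * masks, axis=-1)  # same shape logic as clipped_masked_error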
def main(): env = envstandalone.MultiGhostEvade() # env = envstandalone.GhostEvade() # env = envstandalone.BallCatch() max_timesteps=40000 # max_timesteps=80000 learning_starts=1000 # buffer_size=50000 buffer_size=1000 # exploration_fraction=0.2 exploration_fraction=0.4 exploration_final_eps=0.02 print_freq=10 gamma=.98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq=1 learning_alpha = 0.2 # batch_size=32 # batch_size=64 batch_size=512 # batch_size=1024 train_freq=1 obsShape = (8,8,1) # deicticShape = (3,3,2) # deicticShape = (3,3,4) # deicticShape = (4,4,2) # deicticShape = (4,4,4) deicticShape = (5,5,2) # deicticShape = (6,6,2) # deicticShape = (8,8,2) # num_deictic_patches = 36 # num_deictic_patches = 25 num_deictic_patches = 16 # num_deictic_patches = 9 # num_deictic_patches = 1 # num_actions = 4 # num_actions = 3 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu=16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # # CNN version # # conv model parameters: (num_outputs, kernel_size, stride) # model = models.cnn_to_mlp( ### model = models.cnn_to_mlp_2pathways( ### convs=[(16,3,1)], # convs=[(32,3,1)], ### convs=[(32,4,1)], ### convs=[(16,4,1)], ## hiddens=[16], # hiddens=[32], # dueling=True # ) # MLP version # model = models.mlp([8, 16]) # model = models.mlp([16, 16]) # model = models.mlp([16, 32]) # model = models.mlp([16, 16]) # model = models.mlp([32, 32]) # model = models.mlp([32]) model = models.mlp([]) q_func=model # lr=0.01 lr=0.001 # lr=0.0005 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # # CNN version # return U.BatchInput(deicticShape, name=name) # MLP version return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade,num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func" ) getqTarget = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func_target" ) update_target = build_update_target(scope="deepq", qscope="q_func", qscopeTarget="q_func_target") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1. # grad_norm_clipping=0.1 ) getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeic([obs]) ## CNN version # qCurr = getq(np.array(obsDeictic)) # MLP version qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # MONTE CARLO VERSION # update rewards to actual monte carlo experiences if done: replay_buffer.update_montecarlo(gamma) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeic(obses_t)[:,:,:,0:2] # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2] # Reshape everything to (1152,) form donesTiled = np.repeat(dones,num_deictic_patches) rewardsTiled = np.repeat(rewards,num_deictic_patches) actionsTiled = np.repeat(actions,num_deictic_patches) # # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION # qNextTarget = getqTarget(obses_tp1_deic) # qNext = getq(obses_tp1_deic) # qCurr = getq(obses_t_deic) # Get curr, next values: MLP version qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS # obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2)) # obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2)) # obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2)) # obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] # obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2)) # obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2)) # obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2)) # obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] # qCurr = getq(np.array(obses_t_deic)) # qNext = getq(np.array(obses_tp1_deic)) # actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3] # actionsTiled = actionsTiled - 4 * (actionsTiled>3) # rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled] # donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled] # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:,-1,:],1) # standard # actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext] # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # BELLMAN VERSION targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax # MONTE CARLO VERSION targets = rewardsTiled # # Take min over targets in same group # obses_t_deic_reshape = 
np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) # unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0) # for i in range(np.shape(uniqueCounts)[0]): # targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i]) qCurrTargets = np.copy(qCurr) # Copy into cascade with pruning. expLen = np.shape(qCurr)[0] qCurrTargets[range(expLen),0,actionsTiled] = targets for i in range(num_cascade-1): mask = targets < qCurrTargets[range(expLen),i,actionsTiled] qCurrTargets[range(expLen),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled] # # CNN version # td_error_out, obses_deic_out, targets_out = targetTrain( # obses_t_deic, # qCurrTargets # ) # MLP version td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]), qCurrTargets ) # Update target network periodically. if t > learning_starts and t % target_network_update_freq == 0: update_target() # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
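# A simplified 1-D reading (not from the original code) of the "copy into
# cascade with pruning" loop above: level 0 always takes the new target, and
# each deeper level adopts it only when it undercuts the level above, so deeper
# levels behave like a running minimum over accepted targets. The batch code
# applies the mask rule in place over qCurrTargets; this sketch compares
# against a snapshot to make that intent easier to see. Note that, as written
# above, the Monte Carlo assignment (targets = rewardsTiled) overrides the
# Bellman targets computed on the preceding line.
def _demo_cascade_pruning():
    import numpy as np
    num_cascade = 3
    qCasc = np.array([5.0, 5.0, 5.0])  # cascade levels for one (patch, action)
    for target in [4.0, 6.0, 3.0]:
        qCascOld = qCasc.copy()
        qCasc[0] = target
        for i in range(num_cascade - 1):
            if target < qCascOld[i]:
                qCasc[i + 1] = target
    return qCasc  # deeper levels end at the minimum of the accepted targets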
def main(): env = envstandalone.BallCatch() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,1) # deicticShape = (3,3,2) # deicticShape = (4,4,1) # deicticShape = (4,4,2) deicticShape = (4, 4, 3) # deicticShape = (3,3,4) num_deictic_patches = 25 # num_actions = 4 num_actions = 3 episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Extract deictic patches for an input obs. Each deictic patch has a low level # and a foveated view. # input: n x n x 1 # output: dn x dn x 4 def getDeicticObs(obs): windowLen = deicticShape[0] obsShape = np.shape(obs) obsPadded = np.zeros( (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen)) obsPadded[windowLen:windowLen + obsShape[0], windowLen:windowLen + obsShape[1]] = obs[:, :, 0] deicticObsThis = np.zeros( (windowLen, windowLen, 4) ) # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window deicticObs = [] for i in range(obsShape[0] - windowLen + 1): for j in range(obsShape[1] - windowLen + 1): deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 1 # agent zoomin deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 2 # ball zoomin patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen] for k in range(1, 3): # THE VERSION BELOW USES A FIXED VIEW # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], # [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])], # [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]] # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], # [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], # [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]] # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])], [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])], [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]] deicticObs.append( deicticObsThis.copy() ) # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT... 
return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) shape = np.shape(deicticObsBatch) return (np.reshape( np.array(deicticObsBatch), [shape[0] * shape[1], shape[2], shape[3], shape[4]])) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) # model = models.cnn_to_mlp( # convs=[(16,4,1)], # hiddens=[16], # dueling=True # ) # MLP version model = models.mlp([16, 32]) q_func = model lr = 0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version # return U.BatchInput(deicticShape, name=name) # MLP version return U.BatchInput( [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade) targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr)) getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # obsDeictic = getDeicticObs(obs) obsDeictic = getDeic([obs]) # obsDeictic, patchesTiledStacked2 = getDeic([obs]) # # CNN version # qCurr = getq(np.array(obsDeictic)) # MLP version qCurr = getq( np.reshape( obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, -1, :], 0)) selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeicticObsBatch(obses_t) # obses_tp1_deic = getDeicticObsBatch(obses_tp1) # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # # Get curr, next values: CNN version # qNext = getq(obses_tp1_deic) # qCurr = getq(obses_t_deic) # Get curr, next values: MLP version qNext = getq( np.reshape( obses_tp1_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) qCurr = getq( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:, -1, :], 1) # # This version takes the max over all glimpses # qNextTiled = 
np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade]) qCurrTargets = np.copy(qCurr) # # Copy into cascade without pruning # for i in range(num_cascade): # qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets # Copy into cascade with pruning. qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # # CNN version # td_error_out, obses_deic_out, targets_out = targetTrain( # obses_t_deic, # qCurrTargets # ) # MLP version td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]), qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
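# A minimal sketch (toy values, not from the original code) of the tie-breaking
# and action-selection pattern used throughout these loops: tiny uniform noise
# is added so argmax chooses uniformly at random among exactly tied q-values,
# then the action is taken from the patch-wise maximum.
def _demo_noisy_argmax():
    import numpy as np
    qCurr = np.array([[1.0, 1.0, 0.0],   # 2 patches x 3 actions, with a tie
                      [1.0, 0.5, 0.0]])
    qCurrNoise = qCurr + np.random.random(np.shape(qCurr)) * 0.01
    action = np.argmax(np.max(qCurrNoise, 0))  # best action over the max across patches
    return action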
def dist_learn(env,
               q_dist_func,
               num_atoms=51,
               V_max=10,
               lr=25e-5,
               max_timesteps=100000,
               buffer_size=50000,
               exploration_fraction=0.01,
               exploration_final_eps=0.008,
               train_freq=1,
               batch_size=32,
               print_freq=1,
               checkpoint_freq=2000,
               learning_starts=1000,
               gamma=1.0,
               target_network_update_freq=500,
               prioritized_replay=False,
               prioritized_replay_alpha=0.6,
               prioritized_replay_beta0=0.4,
               prioritized_replay_beta_iters=None,
               prioritized_replay_eps=1e-6,
               num_cpu=1,
               callback=None):
    """Train a distributional deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_atoms: int
        number of atoms in the discrete support of the value distribution
    V_max: float
        largest value in the support of the value distribution
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.single_threaded_session() sess.__enter__() def make_obs_ph(name): print name return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = build_dist_train( make_obs_ph=make_obs_ph, dist_func=q_dist_func, num_actions=env.action_space.n, num_atoms=num_atoms, V_max=V_max, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) # act, train, update_target, debug = build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=env.action_space.n, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10 # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_dist_func': q_dist_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") print model_file # mkdir_p(os.path.dirname(model_file)) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: # print "CCCC" obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # print "Come1" # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones) td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # print "Loss : {}".format(td_errors) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                print("steps : {}".format(t))
                print("episodes : {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                # logger.record_tabular("steps", t)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and t % checkpoint_freq == 0):
                print("==========================")
                print("Error: {}".format(td_errors))
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        print("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                        # logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        #     saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                print("Restored model with mean reward: {}".format(saved_mean_reward))
                # logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return ActWrapper(act, act_params)
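# dist_learn() above delegates the distributional update to build_dist_train.
# For reference, this is a minimal numpy sketch of the standard categorical
# (C51-style) projection such an update performs. It assumes a symmetric
# support [-V_max, V_max] with num_atoms atoms; it illustrates the technique
# and is not necessarily what build_dist_train implements.
def _demo_categorical_projection(rewards, dones, p_next, z, gamma):
    import numpy as np
    batch, n_atoms = p_next.shape
    v_min, v_max = z[0], z[-1]
    dz = (v_max - v_min) / (n_atoms - 1)
    m = np.zeros((batch, n_atoms))
    for j in range(n_atoms):
        # Bellman-shift atom j, clip onto the support, and split its probability
        # mass between the two neighbouring atoms.
        tz = np.clip(rewards + gamma * (1.0 - dones) * z[j], v_min, v_max)
        b = (tz - v_min) / dz
        l, u = np.floor(b).astype(int), np.ceil(b).astype(int)
        eq = (l == u)  # tz landed exactly on an atom: give it the full mass
        m[np.arange(batch), l] += p_next[:, j] * np.where(eq, 1.0, u - b)
        m[np.arange(batch), u] += p_next[:, j] * np.where(eq, 0.0, b - l)
    return m  # projected target distribution, one row per transition

# Example support matching the defaults above:
# z = np.linspace(-V_max, V_max, num_atoms) with num_atoms=51, V_max=10.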
def main():

    # ******* Deictic parameters ********

    # deicticShape is the shape of the patch that is used. For example, a 3,3,2 patch
    # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of
    # deicticShape patches in an entire image. For example, there are 36 3x3 patches
    # contained in an 8x8 observation space (assuming no zero padding), so you must
    # set this number to correspond to deicticShape (see the patch-count sanity
    # check sketched after this function).
    # deicticShape = (3,3,2)
    # deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    # deicticShape = (4,4,4)
    # num_deictic_patches = 36
    num_deictic_patches = 25

    # Desired network type. So far, I've done better with the CNN.
    WHICH_Q = "CNN"
    # WHICH_Q = "MLP"

    # Method used to evaluate the value of the next state. So far, I've found that
    # PAIRED_NEXT works much better than MAX_NEXT. MAX_NEXT only works if you also
    # set MIN_OVER_BATCH to True; otherwise, it doesn't converge.
    # PAIRED_NEXT -> use value of corresponding patch on the next step
    # MAX_NEXT -> use max value over all next-step patches
    NEXT_PATCH = "PAIRED_NEXT"
    # NEXT_PATCH = "MAX_NEXT"

    # If MIN_OVER_BATCH is true, then we find the min value over all targets that
    # have the same corresponding patch. In principle, this should always help, and
    # the larger the batch size, the more it should help. In practice, however, it
    # seems to cap the maximum achievable performance. On the other hand, it can
    # help convergence when using NEXT_PATCH = "MAX_NEXT".
    # MIN_OVER_BATCH = True
    MIN_OVER_BATCH = False

    # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via
    # the cascade. Otherwise (if "AVG"), we use the standard expected-value Q value.
    # "MIN" should work best here; "AVG" is equivalent to the standard DQN backup
    # applied to the patches.
    MIN_OR_AVG_Q = "MIN"
    # MIN_OR_AVG_Q = "AVG"

    # If true, ROTATION_AUGMENTATION augments the agent's experience with
    # rotated versions of the patches. I typically turn this off.
# ROTATION_AUGMENTATION = True ROTATION_AUGMENTATION = False # ******* Load the environment ******** env = envstandalone.StandaloneEnv() obsShape = env.observation_space.shape num_actions = env.action_space.n # ******* Standard DQN parameters ******** max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 1 lr = 0.001 batch_size = 32 train_freq = 1 num_cascade = 5 # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair num_cpu = 16 replay_buffer = ReplayBuffer(buffer_size) exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) if MIN_OR_AVG_Q == "MIN": minoravg = -1 elif MIN_OR_AVG_Q == "AVG": minoravg = 0 else: print("error") # ******* Create neural network model ******** if WHICH_Q == "CNN": # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp(convs=[(32, 3, 1)], hiddens=[32], dueling=True) networkShapeOfObservation = [ -1, deicticShape[0], deicticShape[1], deicticShape[2] ] elif WHICH_Q == "MLP": # MLP version # model = models.mlp([8, 16]) model = models.mlp([16, 32]) # model = models.mlp([32]) # model = models.mlp([]) networkShapeOfObservation = [ -1, deicticShape[0] * deicticShape[1] * deicticShape[2] ] else: print("WHICH_Q error: must select valid q-function") q_func = model # ******* Build tensorflow functions ******** def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): if WHICH_Q == "CNN": return U.BatchInput(deicticShape, name=name) elif WHICH_Q == "MLP": return U.BatchInput( [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name) else: print("WHICH_Q error: must select valid q-function") def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1.) 
getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() U.initialize() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # get q-values for current deictic patches obsDeictic = getDeic([obs]) qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation)) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, minoravg, :], 0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape such that patches and batches are interleaved in the same column donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # # Get curr, next values: NO ROTATION-AUGMENTATION qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation)) qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation)) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS if ROTATION_AUGMENTATION: obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2)) obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2)) obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2)) obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2)) obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2)) obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2)) obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] qCurr = getq(np.array(obses_t_deic)) qNext = getq(np.array(obses_tp1_deic)) actionsTiled = np.r_[actionsTiled, actionsTiled + 1, actionsTiled + 2, actionsTiled + 3] actionsTiled = actionsTiled - 4 * (actionsTiled > 3) rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled, rewardsTiled] donesTiled = np.r_[donesTiled, donesTiled, donesTiled, donesTiled] # Get value of next state if NEXT_PATCH == "PAIRED_NEXT": qNextmax = np.max(qNext[:, minoravg, :], 1) # standard elif NEXT_PATCH == "MAX_NEXT": qNextTiled = np.reshape(qNext[:, minoravg, :], [-1, num_deictic_patches, num_actions]) qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches) else: print("error") # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # Take min over targets in same group if MIN_OVER_BATCH: obses_t_deic_reshape = np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]) unique_deic, uniqueIdx, uniqueCounts = np.unique( obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0) for i in range(np.shape(uniqueCounts)[0]): targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i]) # Copy into cascade with pruning. 
qCurrTargets = np.copy(qCurr) expLen = np.shape(qCurr)[0] qCurrTargets[range(expLen), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(expLen), i, actionsTiled] qCurrTargets[range(expLen),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled] td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape(obses_t_deic, networkShapeOfObservation), qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
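# Sanity check for the deictic parameters above (hypothetical helper, not part
# of the original code): without zero padding, an (n, n) observation contains
# (n - d + 1)**2 patches of size (d, d) -- e.g. 36 3x3 patches or 25 4x4
# patches in an 8x8 observation, matching the num_deictic_patches settings.
def expected_num_patches(obs_side, patch_side):
    return (obs_side - patch_side + 1) ** 2

assert expected_num_patches(8, 3) == 36
assert expected_num_patches(8, 4) == 25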
def main(): env = envstandalone.BallCatch() max_timesteps=20000 buffer_size=50000 exploration_fraction=0.2 exploration_final_eps=0.02 print_freq=10 learning_starts=1000 gamma=.98 target_network_update_freq=500 learning_alpha = 0.2 batch_size=32 train_freq=2 deicticShape = (3,3,1) num_deictic_patches=36 num_actions = 3 episode_rewards = [0.0] # replay_buffer = ReplayBuffer(buffer_size) exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Extract deictic patches for an input obs. Each deictic patch has a low level # and a foveated view. # input: n x n x 1 # output: dn x dn x 4 def getDeicticObs(obs): windowLen = deicticShape[0] obsShape = np.shape(obs) obsPadded = np.zeros((obsShape[0]+2*windowLen,obsShape[1]+2*windowLen)) obsPadded[windowLen:windowLen+obsShape[0],windowLen:windowLen+obsShape[1]] = obs[:,:,0] deicticObsThis = np.zeros((windowLen,windowLen,4)) # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window deicticObs = [] for i in range(obsShape[0] - windowLen + 1): for j in range(obsShape[1] - windowLen + 1): deicticObsThis[:,:,0] = obs[i:i+windowLen,j:j+windowLen,0] == 1 # agent zoomin deicticObsThis[:,:,1] = obs[i:i+windowLen,j:j+windowLen,0] == 2 # ball zoomin patch = obsPadded[i:i+3*windowLen,j:j+3*windowLen] for k in range(1,3): # THE VERSION BELOW USES A FIXED VIEW # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], # [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])], # [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]] # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]] # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL # deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])], # [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])], # [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]] deicticObs.append(deicticObsThis.copy()) # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT... return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) return(np.array(deicticObsBatch)) # input: batch x nxnx1 tensor of observations # output: 8 x batch matrix of deictic observations def convertState(observations): # Reshape to batch x flatimage x channel. # Channel1 = zoomin agent, channel2 = zoomin ball # Channel3 = zoomout agent, channel4 = zoomout ball obs = np.zeros((36,9,4)) for i in range(4): obs[:,:,i] = np.reshape(observations[:,:,:,i],[36,9]) # state_numeric: 4 x batch. 
# row0: pos of agent in zoomin, row1: pos of ball in zoomin # row2: pos of agent in zoomout, row3: pos of ball in zoomout shape = np.shape(obs) # state_numeric = 9*np.ones((4,shape[0])) # 9 indicates agent/ball does not appear at this zoom in this glance state_numeric = 9*np.ones((shape[0],4)) # 9 indicates agent/ball does not appear at this zoom in this glance pos = np.nonzero(obs == 1) for i in range(4): idx = np.nonzero(pos[2]==i)[0] # state_numeric[i,pos[0][idx]] = pos[1][idx] state_numeric[pos[0][idx],i] = pos[1][idx] return np.int32(state_numeric) def convertStateBatch(observations): shape = np.shape(observations) state_numeric_batch = [] for batch in range(shape[0]): state_numeric_batch.append(convertState(observations[batch])) return(np.array(state_numeric_batch)) dimSize = deicticShape[0]*deicticShape[1] + 1 tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) # tabularQ1 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) # tabularQ2 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) # tabularQ3 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) # tabularQ4 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) # tabularQ5 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions]) obs = env.reset() # OHEnc = np.identity(max_num_groups) for t in range(max_timesteps): # get current q-values obsDeictic = getDeicticObs(obs) stateCurr = convertState(obsDeictic) # # do a couple of spot checks to verify that obsDeictic is correct # num2check = 17 # print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1])) # print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3])) # qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # select action qCurrNoise = qCurr + np.random.random()*0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise,0)) selPatch = np.argmax(np.max(qCurrNoise,1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # env.render() # print("action: " + str(action)) # take action new_obs, rew, done, _ = env.step(action) # replay_buffer.add(obs, action, rew, new_obs, float(done)) # if done == 1: # print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew)) # action if t > max_timesteps * 1.05: print("obs:\n" + str(np.squeeze(obs))) print("qCurr:\n" + str(qCurr)) print("action: " + str(action) + ", patch: " + str(selPatch)) print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1])) print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3])) action # if t > learning_starts and t % train_freq == 0: # obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # obses_t = np.reshape(obs,[1,8,8,1]) # obses_tp1 = np.reshape(new_obs,[1,8,8,1]) # stateCurr = convertStateBatch(getDeicticObsBatch(obses_t)) # stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1)) # qNext = tabularQ[stateNext[:,:,0], stateNext[:,:,1], stateNext[:,:,2], stateNext[:,:,3],:] # qNextmax = np.max(np.max(qNext,2),1) # targets = rew + (1-done) * gamma * qNextmax # batch_size = 1 # targets = np.tile(np.reshape(targets,[batch_size,1]),[1,num_deictic_patches]) # tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action] = np.minimum(targets, tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], 
stateCurr[:,:,3],action]) stateNext = convertState(getDeicticObs(new_obs)) qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:] qNextmax = np.max(qNext) targets = rew + (1-done) * gamma * qNextmax tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) # # get next q-values # stateNext = convertState(getDeicticObs(new_obs)) # qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:] # qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:] # # perform learning update # qNextmax = np.max(qNext) # targets = rew + (1-done) * gamma * qNextmax # max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])) # if max_negative_td_error > 5: # max_negative_td_error # print("max_td_error: " + str(max_negative_td_error)) # print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])) # print("targets:\n" + str(targets)) # tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) # target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # targets1 = targets # targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] # # tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ # (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ # + learning_alpha * targets1 # tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ # (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ # + learning_alpha * targets2 # tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ # (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ # + learning_alpha * targets3 # tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ # (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ # + learning_alpha * targets4 # tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \ # (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \ # + learning_alpha * targets5 # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: # print("************************* Episode done! 
**************************") new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr))) print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) obs = new_obs # stop at the end of training if t > max_timesteps * 1.1: # np.set_printoptions(precision=1) # np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)}) np.set_printoptions(formatter={'float_kind':lambda x: "%.1f" % x}) # qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:] # todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape] # todisplay = np.c_[qCurr5,np.transpose(stateCurr)] print("obs:\n" + str(np.squeeze(obs))) # todisplay = np.c_[np.max(qCurr5,1),np.transpose(stateCurr)] # print("q-values:\n" + str(todisplay)) # # print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1])) # print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3])) # print("action: " + str(action) + ", patch: " + str(selPatch)) action # print("obs:\n" + str(np.squeeze(obs))) # print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3)))) # print("action: " + str(action) + ", patch: " + str(selPatch)) # t t
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") # env = gym.make("FrozenLake8x8rob-v0") # env = gym.make("FrozenLake16x16rob-v0") env = gym.make("TestRob3-v0") # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obses_t, windowLen): deicticObses_t = [] for i in range(np.shape(obses_t)[0] - windowLen): for j in range(np.shape(obses_t)[1] - windowLen): deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:]) return np.array(deicticObses_t) # get set of deictic alternatives # input: batch x n x n x channels # output: (batch x deictic) x dn x dn x channels def getDeictic(obses_t, actions, obses_tp1, weights, windowLen): deicticObses_t = [] deicticActions = [] deicticObses_tp1 = [] deicticWeights = [] for i in range(np.shape(obses_t)[0]): for j in range(np.shape(obses_t)[1] - windowLen): for k in range(np.shape(obses_t)[2] - windowLen): deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:]) deicticActions.append(actions[i]) deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:]) deicticWeights.append(weights[i]) return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights) # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong # hiddens=[256], # used in pong # convs=[(8,4,1)], # used for non-deictic TestRob3-v0 convs=[(4,3,1)], # used for deictic TestRob3-v0 hiddens=[16], dueling=True ) # parameters q_func=model lr=1e-3 # max_timesteps=100000 # max_timesteps=50000 max_timesteps=20000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1. 
target_network_update_freq=500 prioritized_replay=False # prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 deicticShape = (3,3,1) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(deicticShape, name=name) matchShape = (batch_size*25,) def make_match_ph(name): return U.BatchInput(matchShape, name=name) sess = U.make_session(num_cpu) sess.__enter__() # act, train, update_target, debug = build_graph.build_train( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, debug = build_graph.build_train_deictic( # getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic( getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min( make_obs_ph=make_obs_ph, make_match_ph=make_match_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # get action to take # action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] # qvalues = getq(np.array(obs)[None]) # action = np.argmax(qvalues) # if np.random.rand() < exploration.value(t): # action = np.random.randint(env.action_space.n) deicticObs = getDeicticObs(obs,3) qvalues = getq(np.array(deicticObs)) action = np.argmax(np.max(qvalues,0)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # # temporarily take uniformly random actions all the time # action = np.random.randint(env.action_space.n) new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Get batch if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None # Convert batch to deictic format obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3) obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])] _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True) # matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)] # td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic) # debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
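# A minimal sketch (toy values, not from the original code) of the fingerprint
# grouping used above: flattened 3x3 patches are deduplicated with np.unique,
# and return_inverse gives each patch the id of its group, so identical patches
# can share a learning target.
def _demo_fingerprint_match():
    import numpy as np
    patches = np.array([[0, 1, 0, 1, 1, 0, 0, 0, 1],
                        [1, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 1, 0, 1, 1, 0, 0, 0, 1]])  # rows 0 and 2 are identical
    _, firstIdx, fingerprintMatch = np.unique(patches, axis=0,
                                              return_index=True,
                                              return_inverse=True)
    return fingerprintMatch  # [0, 1, 0]: rows 0 and 2 share a group id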
class Agent: def __init__(self, dimO, dimA): dimA, dimO = dimA[0], dimO[0] self.dimA = dimA self.dimO = dimO tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) if FLAGS.use_per: self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha) self.beta_schedule = LinearSchedule(FLAGS.beta_iters, initial_p=FLAGS.beta0, final_p=1.0) else: self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) self.noise = np.zeros(self.dimA) obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") per_weight = tf.placeholder(tf.float32, [None], "per_weight") with tf.variable_scope('q'): negQ = self.negQ(obs, act) negQ_entr = negQ - entropy(act) q = -negQ q_entr = -negQ_entr act_grad, = tf.gradients(negQ, act) act_grad_entr, = tf.gradients(negQ_entr, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): # double Q negQ_target = self.negQ(obs_target, act_target) negQ_entr_target = negQ_target - entropy(act_target) act_target_grad, = tf.gradients(negQ_target, act_target) act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target) q_target = -negQ_target q_target_entr = -negQ_entr_target if FLAGS.icnn_opt == 'adam': y = tf.where(term_target, rew, rew + discount * q_target_entr) y = tf.maximum(q_entr - 1., y) y = tf.minimum(q_entr + 1., y) y = tf.stop_gradient(y) td_error = q_entr - y elif FLAGS.icnn_opt == 'bundle_entropy': raise RuntimeError("Needs checking.") q_target = tf.where(term2, rew, rew + discount * q2_entropy) q_target = tf.maximum(q_entropy - 1., q_target) q_target = tf.minimum(q_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_entropy - q_target if FLAGS.use_per: ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0) else: ms_td_error = tf.reduce_mean(tf.square(td_error), 0) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_path = os.path.join(model_path, 'board', FLAGS.exp_id) summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph) if FLAGS.summary: if
FLAGS.icnn_opt == 'adam': tf.summary.scalar('Q', tf.reduce_mean(q)) elif FLAGS.icnn_opt == 'bundle_entropy': tf.summary.scalar('Q', tf.reduce_mean(q_entr)) tf.summary.scalar('Q_target', tf.reduce_mean(q_target)) tf.summary.scalar('loss', ms_td_error) tf.summary.scalar('reward', tf.reduce_mean(rew)) merged = tf.summary.merge_all() # tf functions with self.sess.as_default(): self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weight], [optimize_q, update_target, loss_q, td_error, q, q_target], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr]) self._fg_entr_target = Fun([obs_target, act_target], [negQ_entr_target, act_entr_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(model_path + "/tf") if not FLAGS.force and ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.global_variables_initializer()) self.sess.run(self.makeCvx) self.sess.run([theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs, plot=False): # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None]*3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam*a_diff + (1.-lam)*a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: #print(' + Adam took {} iterations'.format(i)) if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. 
- b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m/(1.-b1t) vhat = v/(1.-b2t) act -= alpha * mhat / (np.sqrt(vhat) + eps) # act = np.clip(act, -1, 1) act = np.clip(act, -1.+1e-8, 1.-1e-8) #print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1.+1e-8, 1.-1e-8, 100) ys = [func(obs[[0],:], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys, alpha=0.5, linestyle="--") plt.plot(hist['act'][0,0,:], hist['f'][0,:], label="Adam's trace") plt.legend() os.makedirs(os.path.join(model_path, "adam"), exist_ok=True) t = time.time() fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert(False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:,:-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:,0,:].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k') plt.legend() os.makedirs(os.path.join(model_path, "adam"), exist_ok=True) t = time.time() fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): with self.sess.as_default(): #print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': f = self._fg_entr # f = self._fg elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: self.noise -= FLAGS.outheta*self.noise - \ FLAGS.ousigma*npr.randn(self.dimA) action += self.noise action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 if FLAGS.use_per: self.rm.add(obs1, self.action, rew, obs2, float(term)) else: self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): with self.sess.as_default(): if FLAGS.use_per: experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t)) (obs, act, rew, ob2, term2, weights, batch_idxes) = experience else: obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) #if np.random.uniform() > 0.7 and np.sum(rew > 0.0) >0 : # print("good reward samples", 100*np.sum(rew > 0.0) / FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._fg_entr_target # f = self._fg_target elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg_target else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) #print('--- Optimizing for training') tflearn.is_training(False) act2 = self.opt(f, ob2, plot=FLAGS.adam_plot) tflearn.is_training(True) _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, act2, term2,
weights, log=FLAGS.summary, global_step=self.t) if FLAGS.use_per: new_priorities = np.abs(td_error) + FLAGS.eps self.rm.update_priorities(batch_idxes, new_priorities) self.sess.run(self.proj) return loss def negQ(self, x, y, reuse=False): szs = [FLAGS.l1size, FLAGS.l2size] assert(len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u'+str(i)) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < nLayers-1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers+1): sz = szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i)) as s: zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i)) as s: z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i)) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i)) as s: z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i)) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z z = tf.reshape(z, [-1], name='energies') return z def __del__(self): self.sess.close()
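The update_target ops built in Agent.__init__ implement a Polyak (soft) target update through assign_sub. A small NumPy sketch of the same rule, with the parameter lists as hypothetical stand-ins for self.theta_ and self.theta_target_:

import numpy as np

def soft_update(theta, theta_target, tau):
    # theta_target <- theta_target - tau * (theta_target - theta),
    # i.e. an exponential moving average of the online parameters.
    for p, p_t in zip(theta, theta_target):
        p_t -= tau * (p_t - p)

With tau << 1 the target network trails the online Q network slowly, which keeps the bootstrapped targets y from chasing themselves.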
def main(): env = envstandalone.TestRob3Env() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,1) deicticShape = (3, 3, 2) num_deictic_patches = 36 num_actions = 4 episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # same as getDeictic except this one just calculates for the observation # input: n x n x channels # output: dn x dn x channels def getDeicticObs(obs): windowLen = deicticShape[0] deicticObs = [] for i in range(np.shape(obs)[0] - windowLen + 1): for j in range(np.shape(obs)[1] - windowLen + 1): # # one-channel output # deicticObsThis = obs[i:i+windowLen,j:j+windowLen,:] # two channel output deicticObsThis = np.zeros(deicticShape) deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 10 deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 20 deicticObs.append(deicticObsThis) return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) shape = np.shape(deicticObsBatch) return (np.reshape( np.array(deicticObsBatch), [shape[0] * shape[1], shape[2], shape[3], shape[4]])) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True) # # MLP version # model = models.mlp([16, 32]) q_func = model lr = 0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version return U.BatchInput(deicticShape, name=name) # # MLP version # return U.BatchInput([9], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade) targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr)) getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # obsDeictic = getDeicticObs(obs) obsDeictic = getDeic([obs]) # CNN version qCurr = getq(np.array(obsDeictic)) # # MLP version # qCurr = getq(np.reshape(obsDeictic,[-1,9])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, -1, :], 0)) selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: CNN version qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: MLP version # qNext = getq(np.reshape(obses_tp1_deic,[-1,9])) # qCurr = getq(np.reshape(obses_t_deic,[-1,9])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:, -1, :], 1) # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade]) qCurrTargets = np.copy(qCurr) # # Copy into cascade without pruning # for i in range(num_cascade): # qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets # Copy into cascade with pruning. qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # CNN version td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets) # # MLP version # td_error_out, obses_deic_out, targets_out = targetTrain( # np.reshape(obses_t_deic,[-1,9]), # qCurrTargets # ) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
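The "copy into cascade with pruning" step above is the heart of this variant: cascade level 0 always receives the Bellman target, and each deeper level accepts it only where it undercuts that level's current estimate, so later heads are progressively more pessimistic. A standalone NumPy sketch of the update (function name and shapes are illustrative, not from the original):

import numpy as np

def cascade_targets(q_curr, actions, targets):
    # q_curr: (batch, num_cascade, num_actions), targets: (batch,)
    q_t = np.copy(q_curr)
    rows = np.arange(q_t.shape[0])
    q_t[rows, 0, actions] = targets  # level 0 takes the raw Bellman target
    for i in range(q_t.shape[1] - 1):
        # deeper levels only accept targets below their current estimate
        mask = targets < q_t[rows, i, actions]
        q_t[rows, i + 1, actions] = np.where(mask, targets, q_t[rows, i + 1, actions])
    return q_t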
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") env = gym.make("FrozenLake8x8nohole-v0") # robShape = (2,) # robShape = (3,) # robShape = (200,) # robShape = (16,) robShape = (64,) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(robShape, name=name) # # these params are specific to mountaincar # def getOneHotObs(obs): # obsFraction = (obs[0] + 1.2) / 1.8 # idx1 = np.int32(np.trunc(obsFraction*100)) # obsFraction = (obs[1] + 0.07) / 0.14 # idx2 = np.int32(np.trunc(obsFraction*100)) # ident = np.identity(100) # return np.r_[ident[idx1,:],ident[idx2,:]] # these params are specific to frozenlake def getOneHotObs(obs): # ident = np.identity(16) ident = np.identity(64) return ident[obs,:] model = models.mlp([32]) # model = models.mlp([64]) # model = models.mlp([64], layer_norm=True) # model = models.mlp([16, 16]) # parameters q_func=model lr=1e-3 # max_timesteps=100000 max_timesteps=50000 # max_timesteps=10000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1.0 target_network_update_freq=500 # prioritized_replay=False prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 # # try mountaincar w/ different input dimensions # inputDims = [50,2] sess = U.make_session(num_cpu) sess.__enter__() act, train, update_target, debug = build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() obs = getOneHotObs(obs) # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) new_obs = getOneHotObs(new_obs) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() obs = getOneHotObs(obs) episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # if done: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show()
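When prioritized_replay is enabled above, the buffer follows the proportional scheme of Schaul et al.: each sampled transition's new priority is |TD error| + eps, sampling is proportional to priority**alpha, and the beta-annealed importance weights correct the sampling bias. A sketch of that bookkeeping, where probs stands for the sampling probabilities the buffer actually used (helper name and arguments are illustrative):

import numpy as np

def per_update(td_errors, probs, buffer_len, beta, eps=1e-6):
    # New priorities: |delta| + eps keeps every transition revisitable
    # even when its TD error happens to be exactly zero.
    new_priorities = np.abs(td_errors) + eps
    # Importance weights w_i = (N * P(i)) ** -beta, normalized by the max
    # so the largest weight is 1 and updates are only scaled down.
    weights = (buffer_len * probs) ** (-beta)
    weights /= weights.max()
    return new_priorities, weights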
def main(max_timesteps): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) env = envstandalone.BlockArrange() # Standard q-learning parameters # max_timesteps=30000 # exploration_fraction=0.3 exploration_fraction = 1 exploration_final_eps = 0.1 gamma = .90 num_cpu = 16 # Used by buffering and DQN learning_starts = 10 buffer_size = 10000 batch_size = 10 target_network_update_freq = 1 train_freq = 1 print_freq = 1 lr = 0.0003 # first two elts of deicticShape must be odd num_patches = env.maxSide**2 num_actions = 2 * num_patches # valueFunctionType = "TABULAR" valueFunctionType = "DQN" fullImageSize = (env.maxSide, env.maxSide, 1) episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) prioritized_replay = False prioritized_replay_alpha = 0.6 prioritized_replay_beta0 = 0.4 prioritized_replay_beta_iters = None prioritized_replay_eps = 1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)], hiddens=[48], dueling=True) def make_fullImage_ph(name): return U.BatchInput(fullImageSize, name=name) def make_target_fullstate_ph(name): return U.BatchInput([num_actions], name=name) def make_weight_fullstate_ph(name): return U.BatchInput([num_actions], name=name) if valueFunctionType == 'DQN': getqFullStateNotHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_notholding", reuse=None) getqFullStateHolding = build_getq_fullstate( make_fullImage_ph=make_fullImage_ph, q_func=q_func, num_actions=num_actions, num_cascade=1, scope="deepq", qscope="q_func_fullstate_holding", reuse=None) targetTrainFullStateNotHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_notholding", grad_norm_clipping=None, reuse=None) targetTrainFullStateHolding = build_targetTrain_fullstate( make_fullImage_ph=make_fullImage_ph, make_target_ph=make_target_fullstate_ph, make_weight_ph=make_weight_fullstate_ph, q_func=q_func, num_actions=num_actions, num_cascade=5, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_fullstate_holding", grad_norm_clipping=None, reuse=None) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get qCurr values if obs[1]: qCurr = getqFullStateHolding([obs[0]]) else: qCurr = getqFullStateNotHolding([obs[0]]) # select action at random qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) # take action new_obs, rew, done, _ = env.step(action) # stateImage_t, stateDiscrete_t, 
actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(new_obs[0]), np.copy(new_obs[1]), np.copy(float(done))) if t > learning_starts and t % train_freq == 0: states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None qNextNotHolding = getqFullStateNotHolding(states_images_tp1) qNextHolding = getqFullStateHolding(states_images_tp1) qNext = np.stack([qNextNotHolding, qNextHolding], axis=2) qNextmax = np.max(qNext[range(batch_size), :, states_discrete_tp1], axis=1) targets = rewards + (1 - dones) * gamma * qNextmax qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t) qCurrHoldingBatch = getqFullStateHolding(states_images_t) qCurrTargetBatch = np.stack( [qCurrNotHoldingBatch, qCurrHoldingBatch], axis=2) qCurrTargetBatch[range(batch_size), actions, states_discrete_t] = targets targetTrainFullStateNotHolding( states_images_t, qCurrTargetBatch[:, :, 0], np.tile(np.reshape(weights, [batch_size, 1]), [1, num_actions])) targetTrainFullStateHolding( states_images_t, qCurrTargetBatch[:, :, 1], np.tile(np.reshape(weights, [batch_size, 1]), [1, num_actions])) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) # mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = copy.deepcopy( new_obs) # without this deepcopy, RL totally fails... # save learning curve filename = 'BAR2_rewards_' + str(num_patches) + "_" + str( max_timesteps) + '.dat' np.savetxt(filename, episode_rewards)
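The loop above keeps one Q head per discrete state (holding vs. not holding) and switches between them per sample. A compact sketch of the stack-and-index step used for the Bellman backup (names are illustrative):

import numpy as np

def two_head_next_max(q_not_holding, q_holding, states_discrete_tp1):
    # Stack the heads, pick each row's head by its discrete state bit,
    # then max over actions, as in the qNextmax line above.
    q_next = np.stack([q_not_holding, q_holding], axis=2)  # (batch, num_actions, 2)
    rows = np.arange(q_next.shape[0])
    return np.max(q_next[rows, :, states_discrete_tp1], axis=1)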
def learn(self, total_timesteps=None, total_episodes=None, log_interval=100, ckpt_interval=100, ckpt_path=None): def _sample_episode(): sample = [] obs = self.env.reset() done = False while not done: update_eps = self.exploration.value(self.ep_done) if np.random.random_sample() > update_eps: action, value = self.policy.predict(obs, deterministic=True) else: action, value = self.policy.predict(obs, deterministic=False) new_obs, reward, done, info = self.env.step(action) sample.append((obs, action, reward)) obs = new_obs return sample episode_rewards = [] episode_successes = [] last_100rewards = np.full(100, np.nan) loop_var = total_timesteps if total_timesteps is not None else total_episodes if total_timesteps is not None: raise ValueError('Only total_episodes can be specified for this class') # if self.exploration_frac is None: # self.exploration = LinearSchedule(frac=self.exploration_ep, # initial=self.exploration_initial_eps, # final=self.exploration_final_eps) # else: # self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var, # initial=self.exploration_initial_eps, # final=self.exploration_final_eps) if self.exploration_type == 'linear': self.exploration = LinearSchedule( frac=self.exploration_frac * loop_var, initial=self.exploration_initial_eps, final=self.exploration_final_eps) elif self.exploration_type == 'exponential': self.exploration = ExponentialSchedule( frac=self.exploration_frac, initial=self.exploration_initial_eps, final=self.exploration_final_eps) train = True while train: sample = _sample_episode() obses, actions, rewards = zip(*sample) self.ep_reward = np.sum(rewards) for idx in range(len(sample)): self.elapsed_steps += 1 discounts = np.array([self.gamma**i for i in range(len(obses)+1)]) expected_reward = sum(rewards[idx:]*discounts[:-(1+idx)]) - self.qvalues[obses[idx], actions[idx]] self.qvalues[obses[idx], actions[idx]] += self.learning_rate * expected_reward # print(np.where(self.qvalues!=0)) if self.policy.intent: intent_update = np.zeros(self.qvalues.shape) for obs, action in zip(obses[idx:], actions[idx:]): intent_update[obs, action] += self.learning_rate tmp = self.hvalues[obses[idx], actions[idx]] * (1-self.learning_rate) tmp += intent_update self.hvalues[obses[idx], actions[idx]] = tmp self.ep_done += 1 last_100rewards[self.ep_done%100] = self.ep_reward print("\rEpisode {}/{}, Average Reward {}".format(self.ep_done,total_episodes, np.nanmean(last_100rewards)),end="") # print(len(sample)) if self.ep_done >= total_episodes: train = False if ckpt_path is not None and ckpt_interval: # this class only runs in episode mode (see the ValueError above) if self.ep_done % ckpt_interval == 0: ckpt_str = str(self.ep_done) full_path = ckpt_path + '/' + ckpt_str # super(DBNModel, self).save(full_path) super(MCTabularRLModel, self).save(full_path)
def main(): np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x}) # Dictionary-based value function q_func_tabular = {} # cols of vectorKey must be boolean less than 64 bits long def getTabularKeys(vectorKey): obsBits = np.packbits(vectorKey, 1) obsKeys = 0 for i in range(np.shape(obsBits)[1]): # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big # as the bits required to encode obsBits. If it is too small, we get hash collisions... obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i]) return obsKeys def getTabular(vectorKey): keys = getTabularKeys(vectorKey) return np.array([ q_func_tabular[x] if x in q_func_tabular else 10 * np.ones(num_states) for x in keys ]) # def trainTabular(vectorKey,qCurrTargets,weights): def trainTabular(vectorKey, qCurrTargets): keys = getTabularKeys(vectorKey) alpha = 0.2 for i in range(len(keys)): if keys[i] in q_func_tabular: q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[ keys[i]] + alpha * qCurrTargets[i] # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i] else: q_func_tabular[keys[i]] = qCurrTargets[i] env = envstandalone.BlockArrange() max_timesteps = 4000 exploration_fraction = 0.3 exploration_final_eps = 0.1 print_freq = 1 gamma = .90 num_cpu = 16 # first two elts of deicticShape must be odd actionShape = (3, 3, 2) num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions = 2 * num_patches num_actions_discrete = 2 valueFunctionType = "TABULAR" # valueFunctionType = "DQN" # actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. 
exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) getMoveActionDescriptors = build_getMoveActionDescriptors( make_obs_ph=make_obs_ph, deicticShape=actionShape) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) actionDescriptors = np.r_[actionsPickDescriptors, actionsPlaceDescriptors] # Get qCurr values actionDescriptorsFlat = np.reshape( actionDescriptors, [-1, actionShape[0] * actionShape[1] * actionShape[2]]) == 1 qCurr = getTabular(actionDescriptorsFlat) qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly # select action at random if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise[:, obs[1]]) if np.random.rand() < exploration.value(t): action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True) actionIdx = np.argmax(qCurrNoise[idx, obs[1]]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) actionsSelected = np.nonzero(inv == actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: print("Error...") # env.render() # take action new_obs, rew, done, _ = env.step(action) # print("action: " + str(action) + ", reward: " + str(rew) + ", done: " + str(done)) # print("action patch:\n" + str(actionDescriptors[action,:])) # if done: # print("*** done ***") # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptorsNext = getMoveActionDescriptors([new_obs[0]]) moveDescriptorsNext = moveDescriptorsNext * 2 - 1 actionsPickDescriptorsNext = np.stack( [moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3) actionsPlaceDescriptorsNext = np.stack( [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3) actionDescriptorsNext = np.stack( [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=0) actionDescriptorsNext = np.reshape(actionDescriptorsNext, [ num_patches * num_actions_discrete, actionShape[0], actionShape[1], actionShape[2] ]) actionDescriptorsNextFlat = np.reshape( actionDescriptorsNext, [num_patches * num_actions_discrete, -1]) == 1 qNext = getTabular(actionDescriptorsNextFlat) # Calculate TD target qNextmax = np.max(qNext[:, new_obs[1]]) target = rew + (1 - done) * gamma * qNextmax # Update value function qCurrTarget = qCurr[action, :] qCurrTarget[obs[1]] = target # target avg value trainTabular([actionDescriptorsFlat[action, :]], [qCurrTarget]) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", 
episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = np.copy(new_obs) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors * 2 - 1 actionsPickDescriptors = np.stack( [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3) actionsPlaceDescriptors = np.stack( [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3) print(str(obs[0][:, :, 0])) # qPickNotHolding = getqNotHolding(actionsPickDescriptors) # qPickHolding = getqHolding(actionsPickDescriptors) # qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1) qPick = getTabular( np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:, 0], [8, 8]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:, 1], [8, 8]))) # qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) # qPlaceHolding = getqHolding(actionsPlaceDescriptors) # qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1) qPlace = getTabular( np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:, 0], [8, 8]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:, 1], [8, 8])))
def main(): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) env = envstandalone.BlockArrange() # Standard q-learning parameters max_timesteps=50000 exploration_fraction=0.3 exploration_final_eps=0.1 gamma=.90 num_cpu = 16 # Used by buffering and DQN learning_starts=10 buffer_size=1 batch_size=1 target_network_update_freq=1 train_freq=1 print_freq=1 lr=0.0003 # first two elts of deicticShape must be odd # actionShape = (3,3,2) patchShape = (3,3,1) lookstackShape = (3,3,2) lookShape = (3,3,3) ppShape = (3,3,2) # num_states = 2 # either holding or not num_patches = env.maxSide**2 num_actions_discrete = 2 num_actions = num_patches + num_actions_discrete valueFunctionType = "DQN" actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions # actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected episode_rewards = [0.0] # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # prioritized_replay=True prioritized_replay=False # prioritized_replay_alpha=1.0 prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None # prioritized_replay_beta_iters=20000 prioritized_replay_eps=1e-6 if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None beta = 1 q_func = models.cnn_to_mlp( # q_func = models.cnn_to_mlp_2pathways( # convs=[(16,3,1), (32,3,1)], # hiddens=[48], convs=[(32,3,1)], hiddens=[48], # convs=[(48,3,1)], # hiddens=[48], dueling=True ) def displayLookStack(lookStack): np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x}) lookStack1 = str(lookStack[:,:,0]) lookStack1 = np.core.defchararray.replace(lookStack1,".00","") lookStack1 = np.core.defchararray.replace(lookStack1,".","") lookStack1 = np.core.defchararray.replace(lookStack1,"0",".") lookStack2 = str(lookStack[:,:,1]) lookStack2 = np.core.defchararray.replace(lookStack2,".00","") lookStack2 = np.core.defchararray.replace(lookStack2,".","") lookStack2 = np.core.defchararray.replace(lookStack2,"0",".") print("lookStack:") print(lookStack1) print(lookStack2) def make_obs_ph(name): return U.BatchInput(env.observation_space.spaces[0].shape, name=name) def make_lookDeic_ph(name): return U.BatchInput(lookShape, name=name) def make_ppDeic_ph(name): return U.BatchInput(ppShape, name=name) def make_target_ph(name): return U.BatchInput([1], name=name) def make_weight_ph(name): return U.BatchInput([1], name=name) getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=lookShape) getqLookNotHolding = build_getq( make_deic_ph=make_lookDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_LookNotHolding" ) getqLookHolding = build_getq( make_deic_ph=make_lookDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_LookHolding" ) getqPPNotHolding = build_getq( make_deic_ph=make_ppDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_PPNotHolding" ) getqPPHolding = build_getq( make_deic_ph=make_ppDeic_ph, q_func=q_func, scope="deepq", qscope="q_func_PPHolding" ) targetTrainLookNotHolding = build_targetTrain( 
make_deic_ph=make_lookDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_LookNotHolding", grad_norm_clipping=1. ) targetTrainLookHolding = build_targetTrain( make_deic_ph=make_lookDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_LookHolding", grad_norm_clipping=1. ) targetTrainPPNotHolding = build_targetTrain( make_deic_ph=make_ppDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_PPNotHolding", grad_norm_clipping=1. ) targetTrainPPHolding = build_targetTrain( make_deic_ph=make_ppDeic_ph, make_target_ph=make_target_ph, make_weight_ph=make_weight_ph, q_func=q_func, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func_PPHolding", grad_norm_clipping=1. ) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() lookStack = np.zeros(lookstackShape) lookStackNext = np.zeros(lookstackShape) episode_rewards = [0.0] td_errors = [0.0] timerStart = time.time() U.initialize() for t in range(max_timesteps): # Get action set: <num_patches> pick actions followed by <num_patches> place actions moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 moveDescriptors = np.reshape(moveDescriptors,[num_patches,patchShape[0],patchShape[1],patchShape[2]]) looksStackTiled = np.tile(lookStack,[num_patches,1,1,1]) lookDescriptors = np.concatenate([moveDescriptors,looksStackTiled],axis=3) if obs[1] == 0: # not holding qCurrLook = getqLookNotHolding(lookDescriptors) qCurrPP = np.r_[getqPPNotHolding([lookStack]),[[0]]] else: # holding qCurrLook = getqLookHolding(lookDescriptors) qCurrPP = np.r_[[[0]],getqPPHolding([lookStack])] qCurr = np.concatenate([qCurrLook,qCurrPP],axis=0) # select action at random qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly if actionSelectionStrategy == "UNIFORM_RANDOM": action = np.argmax(qCurrNoise) if np.random.rand() < exploration.value(t): actionClass = np.random.randint(3) if actionClass == 0: action = np.random.randint(num_patches) else: action = np.random.randint(num_patches,num_patches+2) # action = np.random.randint(num_actions) elif actionSelectionStrategy == "RANDOM_UNIQUE": _,idx,inv = np.unique(lookDescriptors,axis=0,return_index=True,return_inverse=True) idx = np.r_[idx,num_patches,num_patches+1] actionIdx = np.argmax(qCurrNoise[idx]) if np.random.rand() < exploration.value(t): actionIdx = np.random.randint(len(idx)) if actionIdx < len(idx)-2: actionsSelected = np.nonzero(inv==actionIdx)[0] action = actionsSelected[np.random.randint(len(actionsSelected))] else: action = idx[actionIdx] else: print("Error...") # take action new_obs, rew, done, _ = env.step(action) # If look action, then update look stack if action < num_patches: lookStackNext[:,:,1] = np.copy(lookStack[:,:,0]) lookStackNext[:,:,0] = np.copy(moveDescriptors[action][:,:,0]) lookAction = moveDescriptors[action] discreteAction = 0 else: lookAction = np.zeros(patchShape) discreteAction = action - num_patches print("action: " + str(action)) env.render() print("Reward: " + str(rew) + ", done: " + str(done)) displayLookStack(lookStackNext) # discrete state, look state, discrete action, look action, reward, discrete next state, look next 
state, done replay_buffer.add(obs[1], lookStack, discreteAction, lookAction, rew, new_obs[1], lookStackNext, new_obs[0], float(done)) lookStack = np.copy(lookStackNext) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: beta=beta_schedule.value(t) # (assumes the PER buffer returns the 9 stored fields plus weights and indices) statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta) else: statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None moveDescriptorsNext = getMoveActionDescriptors(observations_tp1) moveDescriptorsNext = moveDescriptorsNext*2-1 moveDescriptorsNext = np.reshape(moveDescriptorsNext,[-1,patchShape[0],patchShape[1],patchShape[2]]) looksStackNextTiled = np.repeat(statesLookStack_tp1,num_patches,axis=0) lookDescriptorsNext = np.concatenate([moveDescriptorsNext,looksStackNextTiled],axis=3) # calculate qNext qNextLookNotHolding = np.max(np.reshape(getqLookNotHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1) qNextLookHolding = np.max(np.reshape(getqLookHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1) qNextPPNotHolding = getqPPNotHolding(statesLookStack_tp1) qNextPPHolding = getqPPHolding(statesLookStack_tp1) qNextNotHolding = np.max(np.c_[qNextLookNotHolding,qNextPPNotHolding],axis=1) qNextHolding = np.max(np.c_[qNextLookHolding,qNextPPHolding],axis=1) qNext = np.stack([qNextNotHolding,qNextHolding],axis=1) targets = rewards + (1-dones) * gamma * qNext[range(batch_size),statesHolding_tp1] # Calculate qCurrTarget lookDescriptors = np.concatenate([lookActions,statesLookStack_t],axis=3) qCurrLookNotHoldingT = getqLookNotHolding(lookDescriptors) qCurrLookHoldingT = getqLookHolding(lookDescriptors) qCurrPPNotHoldingT = getqPPNotHolding(statesLookStack_t) qCurrPPHoldingT = getqPPHolding(statesLookStack_t) qCurrT = np.c_[qCurrLookNotHoldingT,qCurrPPNotHoldingT,qCurrLookHoldingT,qCurrPPHoldingT] td_error = qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] - targets qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] = targets targetTrainLookNotHolding(lookDescriptors, np.reshape(qCurrT[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainPPNotHolding(statesLookStack_t, np.reshape(qCurrT[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainLookHolding(lookDescriptors, np.reshape(qCurrT[:,2],[batch_size,1]), np.reshape(weights,[batch_size,1])) targetTrainPPHolding(statesLookStack_t, np.reshape(qCurrT[:,3],[batch_size,1]), np.reshape(weights,[batch_size,1])) if prioritized_replay: new_priorities = np.abs(td_error) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) td_errors[-1] += np.mean(np.abs(td_error)) # accumulate a scalar per step so the per-episode mean below is well-defined # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) td_errors.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1) mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) +
", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror)) timerStart = timerFinal obs = np.copy(new_obs) # display value function obs = env.reset() moveDescriptors = getMoveActionDescriptors([obs[0]]) moveDescriptors = moveDescriptors*2-1 actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3) actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3) print(str(obs[0][:,:,0])) if valueFunctionType == "TABULAR": qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1) else: qPickNotHolding = getqNotHolding(actionsPickDescriptors) qPickHolding = getqHolding(actionsPickDescriptors) qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1) print("Value function for pick action in hold-nothing state:") print(str(np.reshape(qPick[:,0],[8,8]))) print("Value function for pick action in hold-1 state:") print(str(np.reshape(qPick[:,1],[8,8]))) if valueFunctionType == "TABULAR": qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1) else: qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors) qPlaceHolding = getqHolding(actionsPlaceDescriptors) qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1) print("Value function for place action in hold-nothing state:") print(str(np.reshape(qPlace[:,0],[8,8]))) print("Value function for place action in hold-1 state:") print(str(np.reshape(qPlace[:,1],[8,8])))
def main(): parser = argparse.ArgumentParser() parser.add_argument("--policy_name", default="TD3") # Policy name parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name parser.add_argument("--replay_buffer", default="prioritized") # Replay Buffer type parser.add_argument("--replay_buffer_size", default=5e4, type=int) # Replay Buffer capacity parser.add_argument("--replay_buffer_alpha", default=0.6, type=float) # Replay Buffer prioritization weight parser.add_argument("--seed", default=0, type=int) # Sets Gym, PyTorch and Numpy seeds parser.add_argument("--start_timesteps", default=1e4, type=int) # How many time steps purely random policy is run for parser.add_argument("--eval_freq", default=1e3, type=float) # How often (time steps) we evaluate parser.add_argument("--max_timesteps", default=5e4, type=float) # Max time steps to run environment for parser.add_argument("--save_models", default=True, type=lambda s: str(s).lower() in ("true", "1")) # Whether or not models are saved (parsed explicitly: a bare type=bool treats any non-empty string, including "False", as True) parser.add_argument("--expl_noise", default=0.1, type=float) # Std of Gaussian exploration noise parser.add_argument("--batch_size", default=100, type=int) # Batch size for both actor and critic parser.add_argument("--discount", default=0.99, type=float) # Discount factor parser.add_argument("--tau", default=0.005, type=float) # Target network update rate parser.add_argument("--policy_noise", default=0.2, type=float) # Noise added to target policy during critic update parser.add_argument("--noise_clip", default=0.5, type=float) # Range to clip target policy noise parser.add_argument("--policy_freq", default=2, type=int) # Frequency of delayed policy updates parser.add_argument("--lr_actor", default=0.001, type=float) # Learning rate of actor parser.add_argument("--lr_critic", default=0.001, type=float) # Learning rate of critic parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float) # Replay Buffer epsilon (PER) parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float) # Replay Buffer initial beta (PER) args = parser.parse_args() # Training kwargs kwargs = { "policy_name": args.policy_name, "env_name": args.env_name, "replay_buffer": args.replay_buffer, "replay_buffer_size": args.replay_buffer_size, "replay_buffer_alpha": args.replay_buffer_alpha, "seed": args.seed, "start_timesteps": args.start_timesteps, "eval_freq": args.eval_freq, "max_timesteps": args.max_timesteps, "save_models": args.save_models, "expl_noise": args.expl_noise, "batch_size": args.batch_size, "discount": args.discount, "tau": args.tau, "policy_noise": args.policy_noise, "noise_clip": args.noise_clip, "policy_freq": args.policy_freq, "lr_actor": args.lr_actor, "lr_critic": args.lr_critic, "prioritized_replay_eps": args.prioritized_replay_eps, "prioritized_replay_beta0": args.prioritized_replay_beta0 } # cls os.system('cls' if os.name == 'nt' else 'clear') if not os.path.exists("./results"): os.makedirs("./results") if args.save_models and not os.path.exists("./pytorch_models"): os.makedirs("./pytorch_models") # Time stamp for repeated test names ts = time.time() ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S') test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts) plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts) kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts) scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts) print("---------------------------------------")
print("Settings: %s" % (test_name)) utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name)) print("---------------------------------------") # Environment and Agent instantiation env = gym.make(args.env_name) # Set seeds env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Instantiate Replay Buffer if args.replay_buffer == "vanilla": replay_buffer = rb.ReplayBuffer(size = args.replay_buffer_size) PER = False elif args.replay_buffer == "prioritized": replay_buffer = rb.PrioritizedReplayBuffer(size = int(np.round(np.sqrt(args.replay_buffer_size))), alpha = args.replay_buffer_alpha) PER = True prioritized_replay_beta_iters = args.max_timesteps prioritized_replay_beta0 = args.prioritized_replay_beta0 beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p = prioritized_replay_beta0, final_p = 1.0) # Instantiate policy if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps) # Evaluate untrained policy evaluations = [evaluate_policy(env, policy)] # Training loop ####################################### total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 episode_rewards = [] done = True while total_timesteps < args.max_timesteps: if done: if total_timesteps != 0: print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(total_timesteps, episode_num, episode_timesteps, episode_reward)) episode_rewards.append(episode_reward) # PER Beta scheduled update if PER: beta = beta_schedule.value(total_timesteps) else: beta = 0. 
                # Policy update step
                if args.policy_name == "TD3":
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, args.policy_noise,
                                 args.noise_clip, args.policy_freq, beta)
                else:
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, beta)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))
                # save evaluation
                #if args.save_models: policy.save(test_name, directory="./pytorch_models")
                #np.save("./results/%s" % (test_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action
                          + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])
                          ).clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        # Treat time-limit terminations as non-terminal so bootstrapping is not cut off
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Push experience into the replay buffer
        experience = (obs, action, reward, new_obs, done_bool)
        replay_buffer.add(experience)

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy))

    # Save results
    if args.save_models:
        policy.save(test_name, directory="./pytorch_models")
    #np.save("./results/%s" % (evaluations_file), evaluations)
    #np.save("./results/%s" % ('rewards.txt'), episode_rewards)
    utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
    utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
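# How the scheduled beta is consumed inside policy.train when PER is enabled:
# a minimal sketch, assuming a baselines-style buffer API (sample(batch_size, beta)
# returning importance-sampling weights and tree indices, plus update_priorities).
# The local rb.PrioritizedReplayBuffer wrapper may differ in detail, and
# compute_td_error is a hypothetical stand-in for the critic's TD-error computation.

import numpy as np

def per_critic_update_sketch(replay_buffer, batch_size, beta, prioritized_replay_eps):
    (state, action, reward, next_state, done,
     weights, idxes) = replay_buffer.sample(batch_size, beta)
    td_error = compute_td_error(state, action, reward, next_state, done)  # hypothetical helper
    # Importance-sampling weights correct the bias introduced by prioritized sampling
    loss = np.mean(weights * td_error ** 2)
    # New priorities: |TD error| plus a small epsilon so no transition starves
    replay_buffer.update_priorities(idxes, np.abs(td_error) + prioritized_replay_eps)
    return loss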