def agent_start(state): """ Hint: Initialize the variavbles that you want to reset before starting a new episode Arguments: state: numpy array Returns: action: integer """ global Q, last_action, epsilon, last_state select_option = np.array([0, 1]) option = np.random.choice(select_option, p=[epsilon, 1 - epsilon]) x = state[0] y = state[1] if option == 0: action_num = rand_in_range(4) else: action_num = np.argmax(Q[y][x]) if Q[y][x][action_num] == 0: action_num = rand_in_range(4) last_action[0] = action_num action = last_action[0] last_state = state return action
def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer list
    """
    global Q, last_action, last_y, last_x, model

    # pick the first action, don't forget about exploring starts
    x = state[0]
    y = state[1]
    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  # find best action
        if Q[x][y][action_index] == 0:
            action_index = rand_in_range(4)

    last_action = action_index
    last_x = x
    last_y = y
    action = actions[action_index]
    return action

def agent_step(reward, this_observation):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    global last_action

    # the action taken at this time step
    taken_action = int(this_observation[0])
    # how many times this action has been taken in the current run

    if np.sum(op_values) != 0:
        # optimistic initial values: update the estimate and act greedily
        op_values[taken_action] = op_values[taken_action] + (reward - op_values[taken_action]) / 10.0  # might do some learning here
        last_action[0] = np.argmax(op_values)
        return last_action

    # update the action-value estimate
    estimate_values[taken_action] = estimate_values[taken_action] + (reward - estimate_values[taken_action]) / 10.0  # might do some learning here
    current_op_action = np.argmax(estimate_values)

    # epsilon-greedy: explore with probability 1/10
    epsilon = rand_in_range(10)
    if epsilon == 0:
        last_action[0] = rand_in_range(num_actions)
    else:
        last_action[0] = current_op_action
    return last_action

def agent_start(this_observation):
    # returns NumPy array, this_observation: NumPy array
    global last_action
    last_action[0] = rand_in_range(num_actions)
    local_action = np.zeros(1, dtype='int32')
    local_action[0] = rand_in_range(num_actions)
    return local_action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, epsilon, alpha, last_state, gamma, pre_obs_state, pre_obs_action

    # epsilon-greedy action selection
    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    x = state[0]
    y = state[1]
    if option == 0:
        action = rand_in_range(4)  # change this to 9 to rand in 9 actions
    else:
        action = np.argmax(Q[y][x])
        if Q[y][x][action] == 0:
            action = rand_in_range(4)

    # remember which states and actions have been observed
    if state not in pre_obs_state:
        pre_obs_state.append(state)
        pre_obs_action[tuple(state)] = []
    if action not in pre_obs_action[tuple(state)]:
        pre_obs_action[tuple(state)].append(action)

    # direct RL: Q-learning update for the previous state-action pair
    Q[last_state[1]][last_state[0]][last_action] += alpha * (
        reward + gamma * np.max(Q[y][x]) - Q[last_state[1]][last_state[0]][last_action])

    # model learning
    model[last_state[1]][last_state[0]][last_action] = [x, y, reward]

    # planning: n simulated updates from previously observed state-action pairs
    for i in range(n):
        rand_index = rand_in_range(len(pre_obs_state))
        S_x = pre_obs_state[rand_index][0]
        S_y = pre_obs_state[rand_index][1]
        index = rand_in_range(len(pre_obs_action[(S_x, S_y)]))
        rand_action = pre_obs_action[(S_x, S_y)][index]
        next_state = [model[S_y][S_x][rand_action][0], model[S_y][S_x][rand_action][1]]
        Rwd = model[S_y][S_x][rand_action][2]
        Q[S_y][S_x][rand_action] += alpha * (
            Rwd + gamma * np.max(Q[next_state[1]][next_state[0]]) - Q[S_y][S_x][rand_action])

    last_action = action
    last_state = state
    return action

def sample_from_buffers(buffer_one, buffer_two=None):
    """
    Sample a transition uniformly at random from one of buffer_one and buffer_two.
    Which buffer is sampled depends on the current time step, so that both buffers
    are sampled equally often throughout an episode.
    """
    if RL_num_steps() % 2 == 0 or buffer_two is None:
        cur_transition = buffer_one[rand_in_range(len(buffer_one))]
    else:
        cur_transition = buffer_two[rand_in_range(len(buffer_two))]
    return cur_transition

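# A hedged usage sketch (not part of the original): the buffers are assumed to be lists of
# (state, action, reward, next_state) transitions collected during the episode; the exact
# contents below are placeholders for illustration only.
regular_buffer = [((0, 0), 1, 0.0, (0, 1)), ((0, 1), 2, 0.0, (1, 1))]
goal_buffer = [((1, 1), 3, 1.0, (1, 2))]
state, action, reward, next_state = sample_from_buffers(regular_buffer, goal_buffer)
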
def agent_start(this_observation):
    # returns NumPy array, this_observation: NumPy array
    global last_action
    last_action[0] = rand_in_range(num_actions)
    local_action = np.zeros(1)
    local_action[0] = rand_in_range(num_actions)
    # return local_action[0]
    return agent.pick_action()

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, last_y, last_x, model

    # select an action, based on Q
    # s' x and y
    x = state[0]
    y = state[1]

    # direct RL: Q-learning update for the previous state-action pair
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + gamma * np.max(Q[x][y]) - Q[last_x][last_y][last_action])

    # model learning
    modelKey = (last_x, last_y, last_action)
    model[modelKey] = [reward, x, y]

    # planning: n simulated updates from previously observed state-action pairs
    i = 0
    while i < n:
        i += 1
        chosen = False
        while not chosen:
            modelX = rand_in_range(9)
            modelY = rand_in_range(6)
            modelA = rand_in_range(4)
            if model[(modelX, modelY, modelA)][0] != -1.0:
                chosen = True
        modelNextY = model[(modelX, modelY, modelA)][2]
        modelNextX = model[(modelX, modelY, modelA)][1]
        modelReward = model[(modelX, modelY, modelA)][0]
        Q[modelX][modelY][modelA] += alpha_step * (
            modelReward + gamma * np.max(Q[modelNextX][modelNextY]) - Q[modelX][modelY][modelA])

    # epsilon-greedy action selection for the current state
    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  # find best action
        if Q[x][y][action_index] == 0:
            action_index = rand_in_range(4)

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]
    return action

def epsilon_greedy(state1, Q1):
    global action
    # rand_in_range(10) > 0 means the agent goes for the greedy choice 90% of the time
    if rand_in_range(10) > 0:
        # find the action with the largest estimated Q value
        action_index = choose_random_largest(Q1[state1[0]][state1[1]])
    else:
        # 10% of the time the agent chooses a random action
        action_index = rand_in_range(len(action))
    return action_index

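# choose_random_largest is not shown above; a minimal sketch of what it is assumed to do,
# based on its name: pick uniformly at random among the indices that tie for the largest value.
def choose_random_largest(values):
    values = np.asarray(values)
    best_indices = np.where(values == values.max())[0]  # all indices tied for the maximum
    return best_indices[rand_in_range(len(best_indices))]
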
def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, epsilon, alpha, last_state, gamma, n

    # epsilon-greedy action selection
    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    current_x = state[0]
    current_y = state[1]
    if option == 0:
        action_num = rand_in_range(4)  # change this to 9 to rand in 9 actions
    else:
        action_num = np.argmax(Q[current_y][current_x])
        if Q[current_y][current_x][action_num] == 0:
            action_num = rand_in_range(4)

    # direct RL: Q-learning update for the previous state-action pair
    Q[last_state[1]][last_state[0]][last_action[0]] += alpha * (
        reward + gamma * np.max(Q[current_y][current_x]) -
        Q[last_state[1]][last_state[0]][last_action[0]])

    # model learning
    model[(last_state[1], last_state[0])][last_action[0]] = [current_x, current_y, reward]

    # planning: n simulated updates from previously observed state-action pairs
    for i in range(n):
        exist = False
        while not exist:
            model_x = rand_in_range(9)
            model_y = rand_in_range(6)
            model_action = rand_in_range(4)
            if model[(model_y, model_x)][model_action][2] != -1:
                exist = True
        S_x = model[(model_y, model_x)][model_action][0]
        S_y = model[(model_y, model_x)][model_action][1]
        Rwd = model[(model_y, model_x)][model_action][2]
        Q[model_y][model_x][model_action] += alpha * (
            Rwd + gamma * np.max(Q[S_y][S_x]) - Q[model_y][model_x][model_action])

    last_action[0] = action_num
    last_state = state
    action = last_action[0]
    return action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global last_x, last_y, last_action, Model, visited, Q

    x, y = state

    # direct RL: Q-learning update for the previous state-action pair
    Q[last_x, last_y, last_action] += alpha * (
        reward + gamma * np.max(Q[x, y, :]) - Q[last_x, last_y, last_action])

    # model learning
    Model[last_x, last_y, last_action, 0] = x
    Model[last_x, last_y, last_action, 1] = y
    Model[last_x, last_y, last_action, 2] = reward
    visited.append((last_x, last_y, last_action))

    # planning
    Dyna_Q()

    # greedy action with random tie-breaking, overridden by exploration with probability epsilon
    # action = np.argmax(Q[x, y, :])
    action = np.random.choice(np.where(Q[x, y, :] == Q[x, y, :].max())[0])
    if rand_un() < epsilon:
        action = rand_in_range(4)
    # print state, action

    last_x = x
    last_y = y
    last_action = action
    return last_action

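# Dyna_Q() is not shown above; a minimal sketch (an assumption, not the original routine) of
# the planning step it is expected to perform: replay n previously visited (x, y, action)
# triples from the learned Model and apply a Q-learning update to each, using the same
# n, alpha, gamma, Q, Model and visited globals as the function above.
def Dyna_Q():
    for _ in range(n):
        px, py, pa = visited[rand_in_range(len(visited))]  # a previously observed pair
        nx = int(Model[px, py, pa, 0])                     # modelled next state
        ny = int(Model[px, py, pa, 1])
        r = Model[px, py, pa, 2]                           # modelled reward
        Q[px, py, pa] += alpha * (r + gamma * np.max(Q[nx, ny, :]) - Q[px, py, pa])
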
def agent_step(reward, state):
    global old_action, old_state, Q, a

    # choose an action epsilon-greedily
    action = action_select(state)

    old0, old1 = old_state[0], old_state[1]
    gamma_Q = g * np.amax(Q[state[0]][state[1]])

    # updating Q
    Q[old0][old1][old_action] += a * (reward + gamma_Q - Q[old0][old1][old_action])

    # updating the model
    model[old0][old1][old_action] = np.array([state[0], state[1], int(reward)])

    # planning: n simulated updates from previously visited state-action pairs
    if n != 0:
        for i in range(n):
            rs = visited[rand_in_range(len(visited))]
            xyr = model[rs[0][0]][rs[0][1]][rs[1]]
            Q[rs[0][0]][rs[0][1]][rs[1]] += a * (
                xyr[2] + (g * np.amax(Q[xyr[0]][xyr[1]])) - Q[rs[0][0]][rs[0][1]][rs[1]])

    # updating old action and state
    old_action = action
    old_state[0] = state[0]
    old_state[1] = state[1]

    # updating visited if not already visited
    visit = [[state[0], state[1]], action]
    if visit not in visited:
        visited.append(visit)

    return action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, last_state, total_actions

    # select an action, based on Q (epsilon-greedy)
    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    current_x = state[0]
    current_y = state[1]
    last_x = last_state[0]
    last_y = last_state[1]
    if option == 0:
        action = rand_in_range(total_actions)
    else:
        action = np.argmax(Q[state[0]][state[1]])

    # Sarsa update for the previous state-action pair (no discount term in this update)
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[current_x][current_y][action] - Q[last_x][last_y][last_action])

    last_action = action
    last_x = current_x
    last_y = current_y
    last_state = [last_x, last_y]
    return action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global Q, last_action, S, S_, model, previous_states

    S_ = state

    # remember which actions have been taken in which states
    if (S[0], S[1]) not in previous_states:
        previous_states[(S[0], S[1])] = set()
    previous_states[(S[0], S[1])].add(last_action)

    # direct RL: Q-learning update
    Q[S[0]][S[1]][last_action] += alpha * (
        reward + gamma * max(Q[S_[0]][S_[1]]) - Q[S[0]][S[1]][last_action])

    # model learning
    model[S[0]][S[1]][last_action] = (reward, S_[0], S_[1])

    # planning: n simulated updates from previously observed state-action pairs
    for i in range(n):
        S_planning = random.choice(list(previous_states.keys()))
        A_planning = random.sample(previous_states[S_planning], 1)
        reward_planning, x_planning, y_planning = model[S_planning[0]][S_planning[1]][A_planning[0]]
        Q[S_planning[0]][S_planning[1]][A_planning[0]] += alpha * (
            reward_planning + gamma * max(Q[x_planning][y_planning]) -
            Q[S_planning[0]][S_planning[1]][A_planning[0]])

    # epsilon-greedy action selection
    if rand_un() < epsilon:
        action = rand_in_range(4)
    else:
        action = argmax(Q[S[0]][S[1]])

    S = S_
    last_action = action
    return action

def pick_action(arr, epsilon, num_actions):
    # epsilon-greedy: a random action with probability epsilon, otherwise the greedy one
    arg_max = np.argmax(arr)
    if rand_un() < epsilon:
        action = rand_in_range(num_actions)
    else:
        action = arg_max
    return action

def agent_step(reward, state):
    global cur_state, cur_action, weights, e_trace

    next_state = state

    # Update the weights: accumulate the TD error and set replacing traces for the
    # features active in the previous state-action pair
    delta = reward
    cur_state_feature_indices = approx_value(cur_state, cur_action, weights)[1]
    for index in cur_state_feature_indices:
        delta = delta - weights[0][index]
        e_trace[0][index] = 1

    # Choose the next action, epsilon-greedy style, from the new state
    if rand_un() < 1 - EPSILON:
        actions = [
            approx_value(next_state, action, weights)[0]
            for action in range(NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    next_state_feature_indices = approx_value(next_state, next_action, weights)[1]
    for index in next_state_feature_indices:
        delta = delta + GAMMA * weights[0][index]

    weights += ALPHA * delta * e_trace
    e_trace = GAMMA * LAMBDA * e_trace

    cur_state = next_state
    cur_action = next_action
    return cur_action

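# approx_value is not shown above; a minimal sketch of the interface the code above assumes:
# it returns the approximate action value for (state, action) together with the list of active
# feature indices. get_active_features is a hypothetical binary-feature extractor (for example
# a tile coder) and is not part of the original code.
def approx_value(state, action, weights):
    feature_indices = get_active_features(state, action)         # indices of the active features
    value = sum(weights[0][index] for index in feature_indices)  # linear value: sum of active weights
    return value, feature_indices
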
def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global w
    global z
    global last_state
    global last_action

    # accumulate the TD error for the previous state-action pair and set replacing traces
    error = reward
    for t in my_tiles(last_state, last_action):
        error -= w[t]
        z[t] = 1

    # epsilon-greedy action selection
    prob = np.random.rand()
    if prob < epsilon:
        action = rand_in_range(3)
    else:
        action = max_q_hat(state)

    for t in my_tiles(state, action):
        error += discount * w[t]

    # update the weights and decay the eligibility traces
    for i in range(memorySize):
        w[i] += alpha * error * z[i]
        z[i] = z[i] * discount * lamb

    last_state = state
    last_action = action
    return action

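# my_tiles and max_q_hat are not shown above; a minimal sketch of what they are assumed to do.
# The tile-coder details (the IHT, NUM_TILINGS, and the scaling of the state variables) are
# assumptions, not taken from the original code; tiles and IHT refer to Sutton's tiles3 software.
from tiles3 import tiles, IHT

iht = IHT(memorySize)  # hash table sized to the weight vector w used above
NUM_TILINGS = 8

def my_tiles(state, action):
    # active tile indices for a (state, action) pair; the state scaling is a placeholder
    # and would need to match the actual ranges of the problem's state variables
    return tiles(iht, NUM_TILINGS, [NUM_TILINGS * s for s in state], [action])

def max_q_hat(state):
    # greedy action under the current weights w, over the 3 actions used above
    values = [sum(w[t] for t in my_tiles(state, a)) for a in range(3)]
    return int(np.argmax(values))
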
def agent_end(reward):
    """
    Arguments: reward: floating point
    Returns: Nothing
    """
    global Q, returns, path

    # do learning and update pi
    # record the return for every (state, action) pair visited on this episode's path
    for stop in path:
        # print(stop)
        if stop in returns:
            returns[stop].append(reward)
        else:
            returns[stop] = [reward]

    # average the recorded returns to form the action-value estimates
    for key in returns:
        Q[key[0]][key[1]] = (sum(returns[(key[0], key[1])]) /
                             len(returns[(key[0], key[1])]))
    # print()

    # make pi greedy with respect to Q, falling back to a random legal bet when Q is uninformative
    for state in range(1, 100):
        if np.argmax(Q[state]) == 0:
            pi[state] = rand_in_range(min(state, 100 - state)) + 1
        else:
            pi[state] = np.argmax(Q[state])
    return

def agent_step(reward, state):
    global q_values, actions, old_info, reward_buffer, gammas

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    # epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    # learning
    reward_buffer.append(reward)
    if len(old_info) >= N:
        old_state = old_info[0][0]
        old_action = old_info[0][1]
        q_values[old_state, old_action] += ALPHA * (
            np.sum(gammas * np.asarray(reward_buffer)) +
            (GAMMA ** N) * q_values[hash_state, action] -
            q_values[old_state, old_action])
        old_info.pop(0)
        reward_buffer.pop(0)

    old_info.append((hash_state, action))
    return actions[action]

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, actions, last_action, last_y, last_x

    # select an action, based on Q
    # s' x and y
    x = state[0]
    y = state[1]
    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  # find best action

    # update last step's Q
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[x][y][action_index] - Q[last_x][last_y][last_action])

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]
    return action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global alpha, gamma, actions_permitted, Q, action_list, last_action, last_state

    # select an action, based on Q (epsilon-greedy)
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[int(state[0])][int(state[1])])]

    # Sarsa update for the previous state-action pair
    Q[last_state[0], last_state[1], find_action(last_action)] += alpha * (
        reward + gamma * Q[int(state[0]), int(state[1]), find_action(action)] -
        Q[last_state[0], last_state[1], find_action(last_action)])

    last_action = action
    last_state = state
    return action

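# find_action is not shown above; a minimal sketch of what it is assumed to do, mapping an
# entry of action_list back to its index so it can be used to index the last axis of Q.
def find_action(action):
    for i in range(len(action_list)):
        if np.array_equal(action_list[i], action):
            return i
    return None  # should not happen for actions drawn from action_list
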
def agent_step(reward, state):
    global q_values, actions, trajectory

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    # epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])
    # print state, action

    # learning: update every pair in the trajectory, discounted by how long ago it was visited
    n = len(trajectory)
    for i, (s, a) in enumerate(trajectory):
        # Wn = math.exp(-0.5*(g - (r+self.values[ns]))**2)
        q_values[s, a] += 1. / N[s, a] * (GAMMA ** (n - 1 - i)) * (
            reward + GAMMA * q_values[hash_state, action] - q_values[s, a])

    old_state = hash_state
    old_action = action
    # if (hash_state, action) not in trajectory:
    trajectory.append((hash_state, action))
    N[hash_state, action] += 1
    return actions[action]

def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, last_state

    # choose A' from S' using the policy derived from Q
    # 0 represents exploration, 1 represents exploitation
    choice = np.array([0, 1])
    result = np.random.choice(choice, p=[epsilon, 1 - epsilon])
    if result == 0:
        # exploration
        action = rand_in_range(num_actions)
    else:
        # exploitation
        action = np.argmax(Q[state[0], state[1], :])

    # Sarsa update for the previous state-action pair (no discount term in this update)
    Q[last_state[0], last_state[1], last_action] = Q[last_state[0], last_state[1], last_action] + \
        alpha * (reward + Q[state[0], state[1], action] - Q[last_state[0], last_state[1], last_action])

    last_state = state
    last_action = action
    return action

def env_start():
    """ returns numpy array """
    global current_state

    # This is required for exploring starts
    state = rand_in_range(num_total_states) + 1
    current_state = np.asarray([state])
    return current_state

def action_select(s):
    if rand_un() < e:
        # explore
        return rand_in_range(4)
    else:
        # decide action based on policy, breaking ties among maximal actions at random
        return np.random.choice(
            np.where(Q[s[0]][s[1]] == np.amax(Q[s[0]][s[1]]))[0])

def agent_step(reward, state):
    global state_action_values, cur_state, cur_action, cur_epsilon

    next_state = state

    # Choose the next action, epsilon greedy style
    if AGENT == TABULAR:
        if rand_un() < 1 - cur_epsilon:
            # Need to ensure that an action is picked uniformly at random from among
            # those that tie for maximum
            cur_max = state_action_values[state[0]][state[1]][0]
            max_indices = [0]
            for i in range(1, len(state_action_values[state[0]][state[1]])):
                if state_action_values[state[0]][state[1]][i] > cur_max:
                    cur_max = state_action_values[state[0]][state[1]][i]
                    max_indices = [i]
                elif state_action_values[state[0]][state[1]][i] == cur_max:
                    max_indices.append(i)
            next_action = max_indices[rand_in_range(len(max_indices))]
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        # Update the state action values
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(
            max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA * state_action_values[next_state[0]][next_state[1]][next_state_max_action] -
            state_action_values[cur_state[0]][cur_state[1]][cur_action])

    elif AGENT == NEURAL:
        # Choose the next action, epsilon greedy style
        if rand_un() < 1 - cur_epsilon:
            # Get the best action over all actions possible in the next state
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            # Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    cur_state = next_state
    cur_action = next_action
    return next_action

def agent_start(state):
    global state_action_values, cur_state, cur_action

    if AGENT == TABULAR:
        # All value functions are initialized to zero, so we can just select randomly
        # for the first action, since they all tie
        cur_action = rand_in_range(NUM_ACTIONS)
    elif AGENT == NEURAL:
        cur_action = get_max_action(state)
    return cur_action

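# encode_1_hot and get_max_action are not shown above; a minimal sketch of what they are
# assumed to do for a grid of WIDTH x HEIGHT cells. WIDTH and HEIGHT are placeholders, as is
# the assumption that `model` is a Keras model whose input is the flattened one-hot grid.
def encode_1_hot(state):
    one_hot = np.zeros((1, WIDTH * HEIGHT))
    one_hot[0][state[1] * WIDTH + state[0]] = 1  # mark the agent's current cell
    return one_hot

def get_max_action(state):
    q_vals = model.predict(encode_1_hot(state), batch_size=1)
    return int(np.argmax(q_vals))
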
def agent_start(state):
    # pick the first action, don't forget about exploring starts
    global action_hist

    # choose a random action
    action = rand_in_range(min(state[0], 100 - state[0])) + 1
    action_hist[state[0] - 1][action - 1] += 1
    return action

def agent_start(this_observation):
    # returns NumPy array, this_observation: NumPy array
    global last_action, action_times, estimate_values, op_values  # , op_init

    # op_init = 0
    action_times = np.zeros(10)
    estimate_values = np.zeros(10)
    op_values = np.zeros(10)

    # optimistic initialization: seed every action value with the initial observation
    if this_observation[0] != 0:
        for i in range(num_actions):
            op_values[i] = this_observation[0]
        local_action = np.zeros(1)
        local_action[0] = rand_in_range(num_actions)
        return local_action

    # last_action[0] = rand_in_range(num_actions)
    local_action = np.zeros(1)
    local_action[0] = rand_in_range(num_actions)
    last_action[0] = local_action[0]
    return local_action

def agent_step(reward, state):
    # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, last_y, last_x, model, theta, PQueue, model2

    # select an action, based on Q
    # s' x and y
    x = state[0]
    y = state[1]

    # model learning: forward model plus a record of each state's predecessors
    modelKey = (last_x, last_y, last_action)
    model[modelKey] = (reward, x, y)
    model2Key = (x, y)
    if (reward, last_x, last_y, last_action) not in model2[model2Key]:
        model2[model2Key].append((reward, last_x, last_y, last_action))

    # push the previous state-action pair onto the priority queue if its priority exceeds theta
    p = reward + gamma * np.max(Q[x][y]) - Q[last_x][last_y][last_action]
    if p > theta:
        PQueue.put((p, last_x, last_y, last_action))

    # planning: prioritized sweeping for up to 5 updates
    i = 0
    while i < 5 and not PQueue.empty():
        i += 1
        firstTuple = PQueue.get()
        key = firstTuple[1:4]
        (modelX, modelY, modelA) = key
        (modelReward, modelNextX, modelNextY) = model[key]
        Q[modelX][modelY][modelA] += alpha_step * (
            modelReward + gamma * np.max(Q[modelNextX][modelNextY]) - Q[modelX][modelY][modelA])

        # loop over the predecessors of the updated state
        key2 = (modelX, modelY)
        for item in model2[key2]:
            (previousReward, previousX, previousY, previousA) = item
            p = previousReward + gamma * np.max(Q[modelX][modelY]) - Q[previousX][previousY][previousA]
            if p > theta:
                PQueue.put((p, previousX, previousY, previousA))

    # greedy action selection, with a random action when the best Q value is still zero
    action_index = np.argmax(Q[x][y])  # find best action
    if Q[x][y][action_index] == 0:
        action_index = rand_in_range(4)

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]
    return action
