def agent_step(reward, this_observation):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    # Update the estimate for the action just taken (constant step-size update)
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (reward - action_value_estimates[cur_action])

    # Choose a new action according to the current agent's parameter settings
    stp1 = this_observation[0]
    action_selection_prob = rand_un()
    if episode in (OPTIMISTIC_INIT, EPSILON_GREEDY):
        if action_selection_prob <= (1 - epsilon):
            # Greedy: pick the action with the highest estimated value
            atp1 = action_value_estimates.index(max(action_value_estimates))
        else:
            # Explore: pick an action uniformly at random
            atp1 = randInRange(numActions)
    else:
        exit("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")

    action_counts[atp1] += 1
    time_step += 1

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation
    return this_action
def agent_start(this_observation):  # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action, episode, action_value_estimates, action_counts, epsilon, Q1, time_step

    # Set the parameters for the current agent.
    # The episode number is used to distinguish between agent parameter settings.
    if episode == OPTIMISTIC_INIT:
        epsilon = 0.0
        Q1 = 5
    elif episode == EPSILON_GREEDY:
        epsilon = 0.1
        Q1 = 0
    else:
        exit("BAD EPISODE: NO STRATEGY FOR THE CURRENT AGENT!!!")

    # Initialize the action-value estimates to Q1 and the action counts to zero
    action_value_estimates = [Q1 for action in range(numActions)]
    action_counts = [0 for action in range(numActions)]

    stp1 = this_observation[0]  # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)
    action_counts[atp1] += 1

    local_action[0] = atp1
    last_observation = this_observation  # save observation, might be useful on the next step
    this_action = local_action
    time_step = 1
    return this_action
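# The two agent functions above rely on globals (local_action, this_action, last_observation,
# numActions, alpha, epsilon, Q1, and the OPTIMISTIC_INIT / EPSILON_GREEDY episode constants)
# that are set up elsewhere, typically in agent_init. The sketch below shows one way that
# initialization might look; the constant values, the 10-action setting, the step size, and
# the use of np.zeros(1) for the action/observation containers are assumptions for
# illustration, not part of the original listing.
import numpy as np

OPTIMISTIC_INIT = 0   # assumed episode index for the optimistic-initial-values agent
EPSILON_GREEDY = 1    # assumed episode index for the epsilon-greedy agent

def agent_init():
    global local_action, this_action, last_observation, numActions, alpha, epsilon, Q1
    numActions = 10                  # 10-armed bandit (assumed)
    alpha = 0.1                      # constant step size for the value-estimate update (assumed)
    epsilon = 0.0                    # overwritten per-agent in agent_start
    Q1 = 0                           # overwritten per-agent in agent_start
    local_action = np.zeros(1)       # action container returned by agent_start/agent_step
    this_action = np.zeros(1)
    last_observation = np.zeros(1)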
def agent_start(this_observation):  # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action  # , numActions

    stp1 = this_observation[0]  # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)  # choose an action uniformly at random

    local_action[0] = atp1
    last_observation = this_observation  # save observation, might be useful on the next step
    this_action = local_action
    return this_action
def agent_step(reward, this_observation):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global local_action, last_observation, this_action  # , numActions

    stp1 = this_observation[0]
    atp1 = randInRange(numActions)  # might do some learning here

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation
    return this_action
def env_step(this_action):  # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation, arms  # , nStatesSimpleEnv
    episode_over = False

    atp1 = this_action[0]  # how to extract the action
    stp1 = randInRange(nStatesSimpleEnv)  # state transitions are uniform random
    the_reward = randn(0.0, 1.0) + arms[int(atp1)]  # reward is the arm's true value plus N(0, 1) Gaussian noise
    # if rand_un() < 0.05:
    #     episode_over = True  # termination is random

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1], episode_over)
    return this_reward_observation
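# env_step above assumes that arms, nStatesSimpleEnv, local_observation, and
# this_reward_observation have been initialized elsewhere, typically in env_init.
# A minimal sketch follows, assuming a single state and 10 arms whose true values are
# drawn from a standard normal distribution; the sizes, the distribution, and the use
# of np.zeros(1) here are assumptions, not part of the original listing.
import numpy as np

def env_init():
    global local_observation, this_reward_observation, arms, nStatesSimpleEnv
    nStatesSimpleEnv = 1
    arms = np.random.normal(0.0, 1.0, 10)       # true value q*(a) of each arm
    local_observation = np.zeros(1)              # observation container reused every step
    this_reward_observation = (0.0, local_observation, False)  # (reward, observation, terminal)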
import math

def agent_step(reward, this_observation):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    # Update the estimate for the action just taken (constant step-size update)
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (reward - action_value_estimates[cur_action])

    # Choose a new action according to the current agent's parameter settings
    stp1 = this_observation[0]
    action_selection_prob = rand_un()
    if episode == EPSILON_GREEDY:
        if action_selection_prob <= (1 - epsilon):
            # Greedy: pick the action with the highest estimated value
            atp1 = action_value_estimates.index(max(action_value_estimates))
        else:
            # Explore: pick an action uniformly at random
            atp1 = randInRange(numActions)
    elif episode == UCB:
        # Compute an upper confidence bound for each action:
        # estimate plus an uncertainty bonus C * sqrt(ln(t) / N(a)).
        # The +1 in the denominator avoids division by zero for untried actions.
        action_values_UCB = []
        for i in range(numActions):
            cur_UCB_action_value = action_value_estimates[i] + (
                C * math.sqrt(math.log(time_step) / (action_counts[i] + 1)))
            action_values_UCB.append(cur_UCB_action_value)
        atp1 = action_values_UCB.index(max(action_values_UCB))
    else:
        exit("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")

    action_counts[atp1] += 1
    time_step += 1

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation
    return this_action
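# To see how the UCB rule trades off value estimates against uncertainty, the toy numbers
# below (purely illustrative, not taken from the original experiment) walk through the same
# bonus computation used above: an action with a lower estimate but far fewer visits can
# still win the argmax because its exploration bonus is larger. The helper name ucb_scores
# and the example values are assumptions for illustration only.
import math

def ucb_scores(estimates, counts, t, c):
    # upper confidence bound: Q(a) + c * sqrt(ln(t) / (N(a) + 1))
    return [q + c * math.sqrt(math.log(t) / (n + 1)) for q, n in zip(estimates, counts)]

if __name__ == "__main__":
    estimates = [0.5, 0.4, 0.1]   # assumed current value estimates
    counts = [20, 2, 5]           # assumed visit counts
    scores = ucb_scores(estimates, counts, t=28, c=2.0)
    print(scores, scores.index(max(scores)))  # action 1 wins despite a lower estimate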
def agent_start(this_observation):  # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action, episode, action_value_estimates, action_counts, epsilon, Q1, time_step

    # Initialize the action-value estimates to Q1 and the action counts to zero
    action_value_estimates = [Q1 for action in range(numActions)]
    action_counts = [0 for action in range(numActions)]

    stp1 = this_observation[0]  # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)
    action_counts[atp1] += 1

    local_action[0] = atp1
    last_observation = this_observation  # save observation, might be useful on the next step
    this_action = local_action
    time_step = 1
    return this_action
def env_step(this_action):  # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation  # , nStatesSimpleEnv
    episode_over = False

    # Draw a reward from the chosen action's reward distribution
    atp1 = int(this_action[0])  # how to extract the action
    the_reward = randn(bandit_action_values[atp1], 1.0)  # rewards drawn from a N(q*, 1) Gaussian
    stp1 = randInRange(nStatesSimpleEnv)  # state transitions are uniform random

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1], episode_over)
    return this_reward_observation
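# The agent and environment functions above are meant to be driven by an experiment program
# that calls the init/start functions once and then alternates env_step and agent_step.
# The sketch below shows one way to wire them together by hand; in an RL-Glue style setup
# the glue layer would make these calls instead. The env_init here (with its
# bandit_action_values initialization), the run_one_run name, and the num_steps parameter
# are assumptions for illustration, not the original experiment code.
import numpy as np

def env_init():
    global local_observation, this_reward_observation, bandit_action_values, nStatesSimpleEnv
    nStatesSimpleEnv = 1
    bandit_action_values = np.random.normal(0.0, 1.0, 10)  # true value q*(a) of each arm
    local_observation = np.zeros(1)
    this_reward_observation = (0.0, local_observation, False)

def run_one_run(agent_episode, num_steps=1000):
    global episode
    episode = agent_episode          # selects which parameter setting agent_start uses
    env_init()
    agent_init()
    action = agent_start(local_observation)
    rewards = []
    for _ in range(num_steps):
        reward, observation, _ = env_step(action)
        action = agent_step(reward, observation)
        rewards.append(reward)
    return rewards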