def agent_step(reward, state):
    next_state = state

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        next_action = get_max_action_tabular(next_state)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the state action values
    if not a_globs.is_trial_episode:
        next_state_max_action = a_globs.state_action_values[next_state[0]][next_state[1]].index(
            max(a_globs.state_action_values[next_state[0]][next_state[1]]))
        a_globs.state_action_values[a_globs.cur_state[0]][a_globs.cur_state[1]][a_globs.cur_action] += a_globs.ALPHA * (
            reward
            + a_globs.GAMMA * a_globs.state_action_values[next_state[0]][next_state[1]][next_state_max_action]
            - a_globs.state_action_values[a_globs.cur_state[0]][a_globs.cur_state[1]][a_globs.cur_action])

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
def agent_start(state):
    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action_tabular(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
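#The tabular agent above relies on a get_max_action_tabular helper that is not shown here.
#Below is a minimal, self-contained sketch of what such a helper might look like, assuming
#(as in agent_step above) that the value table is indexed as values[x][y][action]; the random
#tie-breaking and the helper's name are illustrative assumptions, not the project's actual code.
import random

def greedy_action_sketch(values, state):
    """Return an index of a maximal-valued action for `state`, breaking ties at random.

    `values` is assumed to be a nested list indexed as values[x][y][action].
    """
    action_values = values[state[0]][state[1]]
    best = max(action_values)
    best_actions = [a for a, v in enumerate(action_values) if v == best]
    return random.choice(best_actions)

#Example usage with a toy 1x1 grid and three actions:
#greedy_action_sketch([[[0.0, 2.5, 2.5]]], (0, 0)) returns either 1 or 2.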
def agent_start(state):
    #Context is a sliding window of the previous n states that gets added to
    #the replay buffer used by auxiliary tasks
    a_globs.cur_context = []
    a_globs.cur_context_actions = []
    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
def sample_from_buffers(buffer_one, buffer_two=None):
    """
    Pick one of buffer one and buffer two according to the buffer sample bias
    probability, then sample uniformly at random from the chosen buffer.
    """

    #NOTE: Will have to add a check here to force sampling from a non-empty buffer
    #if I decide not to wait for non-empty buffers of both types in the reward/state tasks
    if buffer_two is None or rand_un() <= a_globs.BUFFER_SAMPLE_BIAS_PROBABILITY:
        cur_observation = buffer_one[rand_in_range(len(buffer_one))]
    else:
        cur_observation = buffer_two[rand_in_range(len(buffer_two))]
    return cur_observation
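#The observations stored in these replay buffers are accessed later via .states, .next_state and
#.reward (see do_auxiliary_learning and the DQN agent_step below). A minimal sketch of such an
#observation container, assuming a simple namedtuple; the project's actual record may store
#additional fields (for example the action taken):
from collections import namedtuple

#`states` is assumed to hold the sliding window of the N most recent states (the "context"),
#so observation.states[-1] is the state immediately preceding next_state.
ObservationSketch = namedtuple('ObservationSketch', ['states', 'next_state', 'reward'])

#Example: a one-step observation for a grid world with window size N = 1.
#obs = ObservationSketch(states=[(3, 4)], next_state=(3, 5), reward=0)
#obs.states[-1]  -> (3, 4)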
def agent_start(state):
    a_globs.cur_state = state

    #Choose the first action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(a_globs.cur_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        a_globs.cur_action = actions.index(max(actions))
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
def agent_step(reward, state):
    next_state = state

    #Update delta and the eligibility trace (replacing traces for the active features)
    delta = reward
    _, a_globs.cur_state_feature_indices = approx_value(
        a_globs.cur_state, a_globs.cur_action, a_globs.weights)
    for index in a_globs.cur_state_feature_indices:
        delta = delta - a_globs.weights[0][index]
        a_globs.e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style, based on the newly observed state
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(next_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the weights and decay the eligibility trace
    _, next_state_feature_indices = approx_value(next_state, next_action, a_globs.weights)
    for index in next_state_feature_indices:
        delta = delta + a_globs.GAMMA * a_globs.weights[0][index]
    a_globs.weights += (a_globs.ALPHA / a_globs.NUM_TILINGS) * delta * a_globs.e_trace
    a_globs.e_trace = a_globs.GAMMA * a_globs.TRACE * a_globs.e_trace

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return a_globs.cur_action
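#The linear Sarsa(lambda) agent above assumes an approx_value(state, action, weights) helper that
#returns (estimated value, active feature indices) for a sparse binary representation such as tile
#coding. Below is a minimal self-contained sketch of that contract with a stand-in feature
#function; the hashing/discretisation scheme is purely illustrative and is not the project's
#actual tile coder.
import numpy as np

def active_features_sketch(state, action, num_features, num_tilings=8):
    #Stand-in: hash a coarsely discretised (state, action, tiling) triple into one index per tiling.
    return [hash((round(state[0], 1), round(state[1], 1), action, tiling)) % num_features
            for tiling in range(num_tilings)]

def approx_value_sketch(state, action, weights, num_tilings=8):
    """Return (value estimate, active feature indices) for a linear function approximator."""
    indices = active_features_sketch(state, action, weights.shape[1], num_tilings)
    value = sum(weights[0][index] for index in indices)
    return value, indices

#Example, assuming weights are stored as a 1 x num_features row vector as in a_globs.weights:
#weights = np.zeros((1, 4096))
#value, indices = approx_value_sketch((0.3, -0.01), 2, weights)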
def agent_step(reward, state):
    next_state = state
    update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward, next_state)
    next_state_formatted = format_states([next_state])

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, ie max_a Q(s', a)
        q_vals = get_q_vals_aux(next_state, False)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    do_auxiliary_learning(a_globs.cur_state, next_state, reward)

    if RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0:
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
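#update_target_network above is assumed to copy the online network's parameters into the target
#network every NUM_STEPS_TO_UPDATE steps, the standard DQN stabilisation trick. A minimal sketch
#of such a hard update using the Keras get_weights/set_weights API; the project's own helper may
#differ (for example, it could apply a soft/Polyak update instead).
def update_target_network_sketch(online_model, target_model):
    target_model.set_weights(online_model.get_weights())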
def do_auxiliary_learning(cur_state, next_state, reward):
    "Update the weights for the auxiliary network based on both the current interaction with the environment and sampling from experience replay"

    is_verbose = 0

    #Perform direct learning on the current state and auxiliary information
    q_vals = get_q_vals_aux(cur_state, False)
    if next_state:
        #Get the best action over all actions possible in the next state, ie max_a Q(s', a)
        q_vals_next = get_q_vals_aux(next_state, True)
        cur_action_target = reward + (a_globs.GAMMA * np.max(q_vals_next))
        q_vals[0][a_globs.cur_action] = cur_action_target
    else:
        q_vals[0][a_globs.cur_action] = reward

    if a_globs.AGENT == a_globs.REWARD:
        #We make the rewards positive since we care only about the binary
        #distinction between zero and non zero rewards and theano binary
        #cross entropy loss requires targets to be 0 or 1
        aux_target = np.array([[reward]])
    elif a_globs.AGENT == a_globs.STATE:
        if next_state:
            aux_target = format_states([next_state])
        else:
            aux_target = np.zeros(shape=(1, a_globs.FEATURE_VECTOR_SIZE))
    elif a_globs.AGENT == a_globs.NOISE:
        aux_target = np.array([rand_un() for i in range(a_globs.NUM_NOISE_NODES)]).reshape(1, a_globs.NUM_NOISE_NODES)
    elif a_globs.AGENT == a_globs.REDUNDANT:
        nested_target = [q_vals for i in range(a_globs.NUM_REDUNDANT_TASKS)]
        aux_target = np.array([item for sublist in nested_target for item in sublist]).reshape(1, a_globs.NUM_ACTIONS * a_globs.NUM_REDUNDANT_TASKS)

    cur_state_formatted = format_states([cur_state])

    #Check and see if the relevant buffer is non-empty
    if buffers_are_ready(a_globs.buffer_container, a_globs.BUFFER_SIZE) and not a_globs.is_trial_episode:

        #Create the target training batch
        batch_inputs = np.empty(shape=(a_globs.BATCH_SIZE, a_globs.FEATURE_VECTOR_SIZE))
        batch_targets = np.empty(shape=(a_globs.BATCH_SIZE, a_globs.NUM_ACTIONS))
        batch_aux_targets = np.empty(shape=(a_globs.BATCH_SIZE, aux_target.shape[1]))

        #Add the current observation to the mini-batch
        batch_inputs[0] = cur_state_formatted
        batch_targets[0] = q_vals
        batch_aux_targets[0] = aux_target[0]

        #Use the replay buffer to learn from previously visited states
        for i in range(1, a_globs.BATCH_SIZE):
            cur_observation = do_buffer_sampling()

            #NOTE: For now, if N > 1 we only want the most recent state associated with the reward
            #and next state (effectively, setting N > 1 changes nothing right now, since we want to
            #use the same input type as in the regular single task case)
            most_recent_obs_state = cur_observation.states[-1]
            sampled_state_formatted = format_states([most_recent_obs_state])
            sampled_next_state_formatted = format_states([cur_observation.next_state])

            #Get the best action over all actions possible in the next state, ie max_a Q(s', a),
            #bootstrapping off the sampled transition's own reward
            q_vals = get_q_vals_aux(cur_observation.next_state, True)
            cur_action_target = cur_observation.reward + (a_globs.GAMMA * np.max(q_vals))

            #Get the value for the current state of the action which was just taken, ie Q(S, A),
            #and set the target for the specific action taken (we need to pass in the
            #whole vector of q_values, since our network takes state only as input)
            q_vals = get_q_vals_aux(most_recent_obs_state, False)
            q_vals[0][a_globs.cur_action] = cur_action_target

            if a_globs.AGENT == a_globs.REWARD:
                #We make the rewards positive since we care only about the binary
                #distinction between zero and non zero rewards and theano binary
                #cross entropy loss requires targets to be 0 or 1
                aux_target = np.array([[cur_observation.reward]])
            elif a_globs.AGENT == a_globs.STATE:
                aux_target = format_states([cur_observation.next_state])
            elif a_globs.AGENT == a_globs.NOISE:
                aux_target = np.array([rand_un() for _ in range(a_globs.NUM_NOISE_NODES)]).reshape(1, a_globs.NUM_NOISE_NODES)
            elif a_globs.AGENT == a_globs.REDUNDANT:
                nested_target = [q_vals for _ in range(a_globs.NUM_REDUNDANT_TASKS)]
                aux_target = np.array([item for sublist in nested_target for item in sublist]).reshape(1, a_globs.NUM_ACTIONS * a_globs.NUM_REDUNDANT_TASKS)

            batch_inputs[i] = sampled_state_formatted
            batch_targets[i] = q_vals
            batch_aux_targets[i] = aux_target[0]

        #Update the weights using the sampled batch
        a_globs.model.fit(batch_inputs, [batch_targets, batch_aux_targets],
                          batch_size=a_globs.BATCH_SIZE, epochs=1, verbose=0)
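#do_auxiliary_learning above fits a_globs.model against two target arrays at once, which implies a
#two-headed network: a Q-value head of size NUM_ACTIONS plus an auxiliary head whose size depends
#on the task (1 for REWARD, FEATURE_VECTOR_SIZE for STATE, and so on). A minimal sketch of such a
#model with the Keras functional API; the layer sizes, activations and losses here are
#illustrative assumptions, not the project's actual architecture.
from keras.models import Model
from keras.layers import Input, Dense

def build_aux_model_sketch(feature_vector_size, num_actions, aux_output_size, aux_loss='mse'):
    inputs = Input(shape=(feature_vector_size,))
    shared = Dense(64, activation='relu')(inputs)            #shared trunk feeding both heads
    q_head = Dense(num_actions, activation='linear', name='q_values')(shared)
    aux_head = Dense(aux_output_size, activation='linear', name='aux')(shared)
    model = Model(inputs=inputs, outputs=[q_head, aux_head])
    model.compile(optimizer='adam', loss=['mse', aux_loss])
    return model

#Example: a STATE-prediction auxiliary head the same size as the input feature vector.
#model = build_aux_model_sketch(feature_vector_size=100, num_actions=4, aux_output_size=100)
#model.fit(batch_inputs, [batch_targets, batch_aux_targets], batch_size=32, epochs=1, verbose=0)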
def agent_step(reward, state):
    next_state = state
    next_state_formatted = format_states([next_state])

    if not a_globs.is_trial_episode:
        update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward, next_state)

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, ie max_a Q(s', a)
        q_vals = a_globs.model.predict(next_state_formatted, batch_size=1)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Get the target value for the update from the target network
    q_vals = a_globs.target_network.predict(next_state_formatted, batch_size=1)
    cur_action_target = reward + a_globs.GAMMA * np.max(q_vals)

    #Get the value for the current state of the action which was just taken, ie Q(S, A),
    #and set the target for the specific action taken (we need to pass in the
    #whole vector of q_values, since our network takes state only as input)
    cur_state_formatted = format_states([a_globs.cur_state])
    q_vals = a_globs.model.predict(cur_state_formatted, batch_size=1)
    q_vals[0][a_globs.cur_action] = cur_action_target

    #Check and see if the relevant buffer is non-empty
    if buffers_are_ready(a_globs.buffer_container, a_globs.BUFFER_SIZE) and not a_globs.is_trial_episode:

        #Create the target training batch
        batch_inputs = np.empty(shape=(a_globs.BATCH_SIZE, a_globs.FEATURE_VECTOR_SIZE))
        batch_targets = np.empty(shape=(a_globs.BATCH_SIZE, a_globs.NUM_ACTIONS))

        #Add the current observation to the mini-batch
        batch_inputs[0] = cur_state_formatted
        batch_targets[0] = q_vals

        #Use the replay buffer to learn from previously visited states
        for i in range(1, a_globs.BATCH_SIZE):
            cur_observation = do_buffer_sampling()

            #NOTE: For now, if N > 1 we only want the most recent state associated with the reward
            #and next state (effectively, setting N > 1 changes nothing right now, since we want to
            #use the same input type as in the regular single task case)
            most_recent_obs_state = cur_observation.states[-1]
            sampled_state_formatted = format_states([most_recent_obs_state])
            sampled_next_state_formatted = format_states([cur_observation.next_state])

            #Get the best action over all actions possible in the next state, ie max_a Q(s', a),
            #bootstrapping off the sampled transition's own reward
            q_vals = a_globs.target_network.predict(sampled_next_state_formatted, batch_size=1)
            cur_action_target = cur_observation.reward + (a_globs.GAMMA * np.max(q_vals))

            #Get the q_vals to adjust the learning target for the current action taken
            q_vals = a_globs.model.predict(sampled_state_formatted, batch_size=1)
            q_vals[0][a_globs.cur_action] = cur_action_target

            batch_inputs[i] = sampled_state_formatted
            batch_targets[i] = q_vals

        #Update the weights using the sampled batch
        a_globs.model.fit(batch_inputs, batch_targets,
                          batch_size=a_globs.BATCH_SIZE, epochs=1, verbose=0)

    if RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0 and not a_globs.is_trial_episode:
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
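#All of the agents above follow the RL-Glue style interface: agent_start(state) returns the first
#action and agent_step(reward, state) learns and returns each subsequent action. A minimal sketch
#of an episode loop that could drive such an agent, assuming hypothetical agent/environment objects
#with the methods shown; this is an illustration only, not the project's actual experiment harness
#(helpers such as RL_num_steps come from the RL-Glue framework).
def run_episode_sketch(agent, env, max_steps=1000):
    state = env.env_start()
    action = agent.agent_start(state)
    for _ in range(max_steps):
        reward, next_state, is_terminal = env.env_step(action)
        if is_terminal:
            #RL-Glue agents typically also expose an agent_end(reward) hook for the terminal update.
            break
        action = agent.agent_step(reward, next_state)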