# Imports required by the functions below. `hrr` is the local Holographic
# Reduced Representation helper module; RL_Obj, Gate, and wm_content are
# defined elsewhere in this project.
import random
from random import randrange, randint
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline
from plotly.graph_objs import Scatter, Layout
import hrr


def flow_control_test(nstates, nepisodes):
    n = 8
    agent_actions = 2
    gate_actions = 2
    ncolors = 2
    nslots = 1
    nroles = 1

    states = hrr.hrrs(n, nstates)
    colors = hrr.hrrs(n, ncolors)  # external cue
    colors = np.row_stack((colors, identity_vector(n)))
    roles = hrr.hrrs(n, nroles)
    roles = np.row_stack((roles, identity_vector(n)))

    # preconvolve states
    role_state = hrr.oconvolve(roles, states)
    role_state = np.reshape(role_state, (nroles + 1, nstates, n))
    cue_state = hrr.oconvolve(colors, states)
    cue_state = np.reshape(cue_state, (ncolors + 1, nstates, n))

    # create objects
    agent = RL_Obj(n, agent_actions)
    i_gate = RL_Obj(n, gate_actions)
    o_gate = RL_Obj(n, gate_actions)
    WM = wm_content(n, ncolors, nslots)

    for episode in range(nepisodes):
        state = random.randrange(0, nstates)
        color_signal = random.randrange(0, ncolors)
        role_i = 0  # role is available
        slot = 0    # slot number in use

        i_gate_input = role_state[role_i, state, :]  # input for in_gate
        i_gate_state, i_value, i_input = i_gate.action(i_gate_input)
        print('color_cue:', color_signal)
        print('i_gate_state:', i_gate_state)

        WM.wm_in_flow(i_gate_state, slot, color_signal)  # control flow of wm maint contents
        print('wm_maint:', WM.get_wm_maint_statistics()[slot])
        print('wm_maint_contents:', WM.get_one_wm_maint(slot))

        role_o = 1 if WM.wm_maint_slot_is_empty(slot) else 0
        print('role_o:', role_o)

        o_gate_input = role_state[role_o, state, :]  # input for out_gate
        # was i_gate.action(...), which queried the wrong gate; use o_gate
        o_gate_state, o_value, o_input = o_gate.action(o_gate_input)
        print('o_gate_state:', o_gate_state)

        WM.wm_out_flow(o_gate_state, slot)  # control flow of wm out contents
        print('wm_output:', WM.get_wm_output_statistics()[slot])
        wm_out = WM.get_one_wm_output(slot)  # wm out contents for given slot
        print(wm_out)

        agent_input = hrr.convolve(cue_state[color_signal, state], wm_out)
        action, a_value, a_input = agent.action(agent_input)
        print('action:', action)
        print()
        print()
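# `identity_vector` is used above but not defined in this section. A minimal
# sketch, assuming it returns the identity element of circular convolution
# (an impulse: 1 in slot 0, zeros elsewhere), so that
# hrr.convolve(x, identity_vector(n)) == x for any HRR x of length n:
def identity_vector(n):
    v = np.zeros(n)
    v[0] = 1.0
    return v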
# Epsilon-greedy action selection (this appears to be the RL_Obj.action
# method used above).
def action(self, state_space):
    mystate = hrr.convolve(state_space, self.actions)  # one state*action HRR per action
    values = np.dot(mystate, self.W) + self.bias       # Q-value for each action
    sm = softmax(values)
    action = np.argmax(sm)  # greedy choice (argmax of softmax == argmax of values)
    if random.random() < self.epsilon:
        action = random.randrange(0, self.nactions)  # epsilon-exploration
    x = mystate[action]  # input representation for the chosen action
    return action, values[action], x
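# `softmax` is called by action() but not defined in this section. A minimal
# numerically stable sketch, assuming the standard definition:
def softmax(values):
    e = np.exp(values - np.max(values))  # shift by max to avoid overflow
    return e / np.sum(e)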
def testing_maze(nstates, nepisodes, stat_window):
    n = 128
    nactions = 2
    goal = 0
    reward = np.zeros(nstates)
    reward[goal] = 1

    states = hrr.hrrs(n, nstates)
    agent = RL_Obj(n, nactions)
    nsteps = 100

    opt_array = []
    diff_sum = 0
    mycount = 0

    for episode in range(nepisodes):
        mycount += 1
        print('episode:', episode)
        state = random.randrange(0, nstates)
        action, value, my_input = agent.action(states[state])
        agent.set_eligibility_zero()
        optimal_steps = optimal_path(state, goal, nstates)

        for step in range(nsteps):
            r = reward[state]
            if state == goal:
                agent.eligibility_trace_update(my_input)
                agent.td_update_goal(r, value)
                break
            pvalue = value
            agent.eligibility_trace_update(my_input)
            state = ((state + np.array([-1, 1])) % nstates)[action]
            action, value, my_input = agent.action(states[state])
            agent.td_update(r, value, pvalue)

        step_diff = abs(step - optimal_steps)
        print('step_dif:', step_diff)
        diff_sum += step_diff
        if episode % stat_window == 0:
            mean_diff = diff_sum / mycount
            opt_array.append(mean_diff)
            mycount = 0
            diff_sum = 0

    V1 = list(map(lambda x: np.dot(x, agent.W) + agent.bias,
                  hrr.convolve(states, agent.actions[0])))
    V2 = list(map(lambda x: np.dot(x, agent.W) + agent.bias,
                  hrr.convolve(states, agent.actions[1])))
    plotly.offline.plot({
        "data": [
            Scatter(x=[x for x in range(len(V1))], y=V1, name='left'),
            Scatter(x=[x for x in range(len(V2))], y=V2, name='right')
        ],
        "layout": Layout(title="",
                         xaxis=dict(title="state"),
                         yaxis=dict(title="Q(s,a)"))
    })
    plt.plot(opt_array)
    plt.show()
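# `optimal_path` is called above but not defined in this section. A sketch,
# assuming the maze is a ring of nstates cells where each step moves one cell
# left or right (matching the (state + [-1, 1]) % nstates transition above),
# so the optimal step count is the shorter distance around the ring:
def optimal_path(state, goal, nstates):
    d = abs(state - goal)
    return min(d, nstates - d)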
def color_maze_task(nstates, nepisodes, stat_window):
    n = 128
    agent_actions = 2
    gate_actions = 2
    ncolors = 2
    nslots = 1
    nroles = 1

    # goals and rewards: one goal per color, plus a "no cue" condition
    goal = [0, nstates // 2, None]
    reward = np.zeros((ncolors + 1, nstates))
    for x in range(ncolors):
        reward[x, goal[x]] = 1

    # punishment-based reward (alternative scheme, disabled)
    # reward = np.ones((ncolors + 1, nstates))
    # reward *= -1
    # for x in range(ncolors):
    #     reward[x, goal[x]] = 0

    states = hrr.hrrs(n, nstates)
    colors = hrr.hrrs(n, ncolors)  # external cue
    colors = np.row_stack((colors, identity_vector(n)))
    roles = hrr.hrrs(n, nroles)
    roles = np.row_stack((roles, identity_vector(n)))

    # preconvolve states
    role_state = hrr.oconvolve(roles, states)
    role_state = np.reshape(role_state, (nroles + 1, nstates, n))
    cue_state = hrr.oconvolve(colors, states)
    cue_state = np.reshape(cue_state, (ncolors + 1, nstates, n))

    agent = RL_Obj(n, agent_actions)
    i_gate = Gate(n, gate_actions)
    o_gate = Gate(n, gate_actions)
    WM = wm_content(n, ncolors, nslots)
    nsteps = 100

    opt_array = []
    diff_sum = 0
    mycount = 0

    for episode in range(nepisodes):
        print('episode:', episode)
        mycount += 1
        state = random.randrange(0, nstates)
        color_signal = random.randrange(0, ncolors)
        color = color_signal
        optimal_steps = optimal_path(state, goal[color_signal], nstates)  # tracks number of optimal steps

        role_i = 0  # role is available
        slot = 0    # slot number in use
        forced_igate_state = None
        forced_ogate_state = None
        WM.flush_all_wm_maint()

        i_gate_input = role_state[role_i, state, :]  # input for in_gate
        i_gate_state, i_value, i_input = i_gate.action(i_gate_input, forced_igate_state)
        WM.wm_in_flow(i_gate_state, slot, color_signal)  # control flow of wm maint contents

        role_o = 1 if WM.wm_maint_slot_is_empty(slot) else 0
        o_gate_input = role_state[role_o, state, :]  # input for out_gate
        o_gate_state, o_value, o_input = o_gate.action(o_gate_input, forced_ogate_state)
        WM.wm_out_flow(o_gate_state, slot)  # control flow of wm out contents
        wm_out = WM.get_one_wm_output(slot)  # wm out contents for given slot

        agent_input = hrr.convolve(cue_state[color_signal, state], wm_out)
        action, a_value, a_input = agent.action(agent_input)

        i_gate.set_eligibility_zero()
        o_gate.set_eligibility_zero()
        agent.set_eligibility_zero()

        # clear wm output
        WM.flush_all_wm_output()

        for step in range(nsteps):
            r = reward[color, state]
            if state == goal[color]:
                i_gate.eligibility_trace_update(i_input)
                o_gate.eligibility_trace_update(o_input)
                agent.eligibility_trace_update(a_input)
                i_gate.td_update_goal(r, i_value)
                o_gate.td_update_goal(r, o_value)
                agent.td_update_goal(r, a_value)
                break

            p_i_value = i_value  # Q val for input gate
            p_o_value = o_value  # Q val for output gate
            p_a_value = a_value  # Q val for agent

            # update eligibility traces
            i_gate.eligibility_trace_update(i_input)
            o_gate.eligibility_trace_update(o_input)
            agent.eligibility_trace_update(a_input)

            # change state in maze by taking action
            state = ((state + np.array([-1, 1])) % nstates)[action]

            # turn off cue
            color_signal = 2
            role_i = 1  # role is unavailable
            forced_igate_state = 'Closed'
            forced_ogate_state = 'Open'

            i_gate_input = role_state[role_i, state, :]  # input for in_gate
            i_gate_state, i_value, i_input = i_gate.action(i_gate_input, forced_igate_state)
            # WM.wm_in_flow(i_gate_state, slot, color_signal)  # control flow of wm maint contents

            role_o = 1 if WM.wm_maint_slot_is_empty(slot) else 0  # checks if role is available in wm_maint
            o_gate_input = role_state[role_o, state, :]  # input for out_gate
            o_gate_state, o_value, o_input = o_gate.action(o_gate_input, forced_ogate_state)
            WM.wm_out_flow(o_gate_state, slot)  # control flow of wm out contents
            wm_out = WM.get_one_wm_output(slot)  # wm out contents for given slot

            agent_input = hrr.convolve(cue_state[color_signal, state], wm_out)
            action, a_value, a_input = agent.action(agent_input)

            # td update
            i_gate.td_update(r, i_value, p_i_value)
            o_gate.td_update(r, o_value, p_o_value)
            agent.td_update(r, a_value, p_a_value)

            # clear wm output
            WM.flush_all_wm_output()

        step_diff = abs(step - optimal_steps)
        diff_sum += step_diff
        if episode % stat_window == 0:
            mean_diff = diff_sum / mycount
            opt_array.append(mean_diff)
            mycount = 0
            diff_sum = 0

    # optimal steps
    plotly.offline.plot({
        "data": [Scatter(x=[x for x in range(len(opt_array))], y=opt_array)]
    })
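# Example driver -- a sketch only; the argument values below are assumptions
# for illustration, not settings taken from the original experiments:
if __name__ == '__main__':
    random.seed(0)     # seed the RNGs for reproducible runs
    np.random.seed(0)
    testing_maze(nstates=20, nepisodes=1000, stat_window=50)
    # color_maze_task(nstates=20, nepisodes=1000, stat_window=50)
    # flow_control_test(nstates=20, nepisodes=10)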
print(" ") eligibility = np.zeros(lengthHRR) currentLocation = randrange(0, worldSize) workingMemory = 0 currentTask = randint(1, 2) currentSignal = currentTask for timestep in range(1, 100): currentReward = reward[currentSignal, currentLocation] currentState = hrr.convolve( hrr.convolve(world[currentLocation, :], signals[currentTask, :]), memory[workingMemory, :]) currentValue = np.dot(currentState, weights) + bias # store previous information previousLocation = currentLocation previousState = currentState previousWM = workingMemory previousTask = currentTask previousValue = currentValue eligibility = td_lambda * eligibility # -----------------------------------------Working Memory update process---------------------------------------------- # Threshold determines possible candidates for working memory mechanism if stateDimension < candidateThreshold: