def gen_encoders(self, N, contextD, context_scale):
    """Generate encoders for state population of learning agent.

    :param N: number of neurons in state population
    :param contextD: dimension of context vector representation
    :param context_scale: weight on context representation relative to state
        (1.0 = equal weighting)
    """

    if contextD > 0:
        contexts = MU.I(contextD)
    else:
        contexts = [[]]

    # neurons each sensitive to different combinations of stimuli
    encs = (list(MU.I(self.stateD)) +
            [o + s + c
             for o in MU.I(self.num_orientations)
             for s in MU.I(self.num_shapes)
             for c in MU.I(self.num_colours)])

    return [HRLutils.normalize(
        HRLutils.normalize(random.choice(encs)) +
        [x * context_scale for x in random.choice(contexts)])
        for _ in range(N)]
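# Minimal plain-Python sketch of how one of these encoders is assembled,
# assuming HRLutils.normalize simply rescales a vector to unit length (which
# is how it is used here). The stimulus part and the scaled context part are
# concatenated and then renormalized, so the final encoder has dimension
# stateD + contextD. The small sizes below are illustrative only.
import math
import random


def _normalize_sketch(v):
    norm = math.sqrt(sum(x ** 2 for x in v))
    return [x / norm for x in v] if norm > 0 else v

stateD, contextD, context_scale = 3, 2, 1.0
stimulus_encs = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]  # rows of MU.I(stateD)
contexts = [[1, 0], [0, 1]]                        # rows of MU.I(contextD)

# one encoder: unit-length stimulus part, concatenated with a scaled context
# part, then renormalized so the combined vector has length 1
enc = _normalize_sketch(_normalize_sketch(random.choice(stimulus_encs)) +
                        [x * context_scale for x in random.choice(contexts)])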
def __init__(self, actions, mapname, contextD, context_rewards, **kwargs):
    """Initialize the environment variables.

    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param mapname: filename for map file
    :param contextD: dimension of vector representing context
    :param context_rewards: mapping from region labels to rewards for being
        in that region (each entry represents one context)
    :type context_rewards: dict {"regionlabel":rewardval,...}
    :param **kwargs: see PlaceCellEnvironment.__init__
    """

    PlaceCellEnvironment.__init__(self, actions, mapname,
                                  name="ContextEnvironment", **kwargs)

    self.rewards = context_rewards

    # generate vectors representing each context
    self.contexts = {}  # mapping from region label to context vector
    for i, r in enumerate(self.rewards):
        self.contexts[r] = list(MU.I(contextD)[i])
    self.context = self.contexts[random.choice(self.contexts.keys())]

    # randomly pick a new context every context_delay seconds
    self.context_delay = 60
    self.context_update = self.context_delay

    self.create_origin("placewcontext",
                       lambda: self.place_activations + self.context)
    self.create_origin("context", lambda: self.context)
def __init__(self, N, d, name="PositiveBias"):
    """Builds the PositiveBias network.

    :param N: base number of neurons
    :param d: dimension of input signal
    :param name: name for network
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    tauPSC = 0.007
    biaslevel = 0.03  # the value to be output for negative inputs

    # threshold the input signal to detect positive values
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0, 0.1))
    neg_thresh = net.make_array("neg_thresh", N, d, encoders=[[1]],
                                node_factory=nfac)
    neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

    # create a population that tries to output biaslevel across
    # all dimensions
    bias_input = net.make_input("bias_input", [biaslevel])
    bias_pop = net.make_array(
        "bias_pop", N, d, node_factory=HRLutils.node_fac(),
        eval_points=[[x * 0.01] for x in range(0, int(biaslevel * 200))])
    net.connect(bias_input, bias_pop, pstc=tauPSC)

    # the individual dimensions of bias_pop are then inhibited by the
    # output of neg_thresh (so any positive values don't get the bias)
    net.connect(neg_thresh, bias_pop, pstc=tauPSC,
                func=lambda x: [1.0] if x[0] > 0 else [0.0],
                transform=[[-10 if i == k else 0 for k in range(d)]
                           for i in range(d)
                           for _ in range(bias_pop.getNeurons() / d)])

    # the whole population is inhibited by the learn signal, so that it
    # outputs 0 if the system isn't supposed to be learning
    bias_pop.addTermination("learn",
                            [[-10] for _ in range(bias_pop.getNeurons())],
                            tauPSC, False)

    self.exposeTermination(neg_thresh.getTermination("input"), "input")
    self.exposeTermination(bias_pop.getTermination("learn"), "learn")
    self.exposeOrigin(bias_pop.getOrigin("X"), "X")
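# Plain-Python reference for the behaviour PositiveBias approximates (a
# sketch only, ignoring neural dynamics, PSC filtering, and the exact gating
# mechanics): each output dimension is biaslevel when the corresponding input
# is non-positive, 0 when it is positive, and the whole output is suppressed
# when the system is not learning.
def positive_bias_reference(x, learning, biaslevel=0.03):
    if not learning:
        return [0.0] * len(x)
    return [0.0 if xi > 0 else biaslevel for xi in x]

# e.g. positive_bias_reference([0.5, -0.2, -0.8], learning=True)
# -> [0.0, 0.03, 0.03]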
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius ** 2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop", stateN, stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[[x / statelength, y / statelength]
                     for x in range(-int(stateradius), int(stateradius))
                     for y in range(-int(stateradius), int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC,
                                    False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues", N, stateN,
                                           actions, learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
def gen_encoders(self, N, contextD, context_scale):
    """Generates encoders for state population in RL agent.

    State aspect of encoders comes from PlaceCellEnvironment. Context
    component is a basis vector with contextD dimensions, scaled to length
    context_scale.
    """

    s_encoders = PlaceCellEnvironment.gen_encoders(self, N)
    c_encoders = [random.choice(MU.I(contextD)) for _ in range(N)]
    c_encoders = [[x * context_scale for x in enc] for enc in c_encoders]
    encoders = [s + list(c) for s, c in zip(s_encoders, c_encoders)]

    # normalize the combined encoders to unit length
    encoders = [[x / math.sqrt(sum([y ** 2 for y in e])) for x in e]
                for e in encoders]
    return encoders
def qnetwork(stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None):
    net = nef.Network("QNetwork")

    with declarative_syntax(net):
        N = 50
        statelength = math.sqrt(2 * stateradius ** 2)
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.0
        weight_save = 600.0  # period to save weights (realtime, not simulation time)

        # set up relays
        direct_mode('state_relay', 1, dimension=stateD)
        add_decoded_termination('state_relay', 'input', MU.I(stateD), .001,
                                False)

        # create state population
        ensemble('state_pop',
                 neurons=LIF(stateN),
                 dimensions=stateD,
                 radius=statelength,
                 encoders=state_encoders,
                 )
        connect('state_relay', 'state_pop', filter=tauPSC)

        memory('saved_state',
               neurons=LIF(N * 4),
               dimension=stateD,
               inputscale=50,
               radius=stateradius,
               direct_storage=True)

        # N.B. the "." syntax refers to an ensemble created by the `memory`
        # macro
        connect('state_relay', 'saved_state.target')

        ensemble('old_state_pop',
                 neurons=LIF(stateN),
                 dimensions=stateD,
                 radius=statelength,
                 encoders=state_encoders)
        connect('saved_state', 'old_state_pop', filter=tauPSC)

        # mess with the intercepts?
        for name in 'state_pop', 'old_state_pop':
            set_intercepts(name, IndicatorPDF(0, 1))

        fixMode('state_relay')
        fixMode('state_pop', ['default', 'rate'])
        fixMode('old_state_pop', ['default', 'rate'])
def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None,
                         flat=False, label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (list(MU.diag([3 for _ in range(env.stateD)])) +
             [o + s + c
              for o in orientations
              for s in shapes
              for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc, state_evals=evals,
                                    discount=0.5, **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env,
        name="NavTermNode", state_delay=0.1, reset_delay=0.05,
        reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions,
                                     name="CtrlAgent", state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals, discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env,
        name="CtrlTermNode", state_delay=0.1, reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"), reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay, ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
    # def ctrl_reward_func(x):
    #     if abs(x[0]) < 0.5:
    #         return 0.0
    #
    #     if flat:
    #         return 1.5 if x[1] + x[2] < 0.5 else -1.5
    #     else:
    #         if x[1] + x[2] < 0.5:
    #             return -1.5
    #         if [round(a) for a in env.state[-2:]] == [round(b)
    #                                                   for b in x[1:]]:
    #             return 1.5
    #         else:
    #             return -1.5
    # net.connect(reward_relay, ctrl_agent.getTermination("reward"),
    #             func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] ==
                    env.state[:env.num_orientations] else -1.5)
        else:
            return (1.5 if env.action[1] ==
                    env.state[env.num_orientations:-env.num_colours]
                    else -1.5)
    net.connect(reward_relay, nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]
    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib, boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"), boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])
    net.connect(boost, nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    weight_save = 1.0  # period to save weights (realtime, not simulation time)
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)),
            weight_save)
    ]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(
        period=1,
        filename=HRLutils.datafile("dataoutput_%s.txt" % label),
        header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

    # net.add_to_nengo()
    # net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
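# Plain-Python restatement of the reward shaping applied to the control agent
# above (the lambda passed to net.connect): x[0] is the raw environment
# reward, and x[1], x[2] are the two components of the control agent's action
# output, so the bias bonus only applies when a rule (rather than "null") was
# selected. A sketch for reference only.
def ctrl_shaped_reward(x, bias=0.0):
    if x[1] + x[2] > 0.5:               # a rule was selected
        return x[0] + bias * abs(x[0])  # bonus scales with reward magnitude
    return x[0]                         # "null" selected: raw reward only

# e.g. ctrl_shaped_reward([1.5, 1.0, 0.0], bias=0.1) -> 1.65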
def run_deliveryenvironment(navargs, ctrlargs, tag=None, seed=None):
    """Runs the model on the delivery task.

    :param navargs: kwargs for the nav_agent (see SMDPAgent.__init__)
    :param ctrlargs: kwargs for the ctrl_agent (see SMDPAgent.__init__)
    :param tag: string appended to datafiles associated with this run
    :param seed: random seed used for this run
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    if tag is None:
        tag = str(seed)

    net = nef.Network("runDeliveryEnvironment", seed=seed)

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # relative scale of context vector vs state vector
    max_state_input = 2  # maximum length of input vector to state population

    # labels and vectors corresponding to basic actions available to the
    # system
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    if "load_weights" in navargs and navargs["load_weights"] is not None:
        navargs["load_weights"] += "_%s" % tag
    if "load_weights" in ctrlargs and ctrlargs["load_weights"] is not None:
        ctrlargs["load_weights"] += "_%s" % tag

    # ##ENVIRONMENT

    env = deliveryenvironment.DeliveryEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall",
                  -1: "floor",
                  -256: "a",
                  -2088896: "b"},
        imgsize=(5, 5), dx=0.001, placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    # generate encoders and divide them by max_state_input (so that inputs
    # will be scaled down to radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # read in eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % tag)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                    actions, name="NavAgent",
                                    state_encoders=enc, state_evals=evals,
                                    state_threshold=0.8, **navargs)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # output of nav_agent is what goes to the environment
    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # termination node for nav_agent (just a timer that goes off regularly)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None}, env, contextD=2,
        name="NavTermNode")
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    # ##CTRL AGENT

    # actions corresponding to "go to A" or "go to B"
    actions = [("a", [0, 1]), ("b", [1, 0])]

    ctrl_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                     actions, name="CtrlAgent",
                                     state_encoders=enc, state_evals=evals,
                                     state_threshold=0.8, **ctrlargs)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    # ctrl_agent gets environmental state and reward
    net.connect(env.getOrigin("placewcontext"),
                ctrl_agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"),
                ctrl_agent.getTermination("reward"))

    # termination node for ctrl_agent (terminates whenever the agent is in
    # the state targeted by the ctrl_agent)
    # also has a long timer so that ctrl_agent doesn't get permanently stuck
    # in one action
    ctrl_term_node = terminationnode.TerminationNode(
        {"a": [0, 1],
         "b": [1, 0],
         terminationnode.Timer((30, 30)): None},
        env, contextD=2, name="CtrlTermNode", rewardval=1.5)
    net.add(ctrl_term_node)

    # reward for nav_agent is the pseudoreward from ctrl_agent termination
    net.connect(ctrl_term_node.getOrigin("pseudoreward"),
                nav_agent.getTermination("reward"))

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # connect ctrl_agent action to termination context
    # this is used so that ctrl_term_node knows what the current goal is (to
    # determine termination and pseudoreward)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_term_node.getTermination("context"))

    # state input for nav_agent is the environmental state + the output of
    # ctrl_agent
    ctrl_output_relay = net.make("ctrl_output_relay", 1,
                                 len(env.placecells) + contextD,
                                 mode="direct")
    ctrl_output_relay.fixMode()
    trans = (list(MU.I(len(env.placecells))) +
             [[0 for _ in range(len(env.placecells))]
              for _ in range(contextD)])
    net.connect(env.getOrigin("place"), ctrl_output_relay,
                transform=trans)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_output_relay,
                transform=([[0 for _ in range(contextD)]
                            for _ in range(len(env.placecells))] +
                           list(MU.I(contextD))))

    net.connect(ctrl_output_relay, nav_agent.getTermination("state_input"))

    # periodically save the weights
    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, tag)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, tag)),
            weight_save)
    ]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile(
                                 "dataoutput_%s.txt" % tag))
    net.add(data)
    data.record(env.getOrigin("reward"))
    q_net = ctrl_agent.getNode("QNetwork")
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))

    # net.add_to_nengo()
    # net.run(10000)
    net.view()

    for t in threads:
        t.stop()
def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0,
             discount=0.3):
    """Builds the ErrorNetwork.

    :param num_actions: the number of actions available to the system
    :param Qradius: expected radius of Q values
    :param rewardradius: expected radius of reward signal
    :param discount: discount factor
    """

    self.name = "ErrorNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007

    # soft cap on error magnitude (large errors seem to cause problems with
    # overly-generalizing the learning)
    errorcap = 0.1

    # set up relays
    vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
    vals_relay.fixMode()
    vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                     False)

    old_vals_relay = net.make("old_vals_relay", 1, num_actions,
                              mode="direct")
    old_vals_relay.fixMode()
    old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    curr_bg_relay = net.make("curr_bg_relay", 1, num_actions, mode="direct")
    curr_bg_relay.fixMode()
    curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                        False)

    saved_bg_relay = net.make("saved_bg_relay", 1, num_actions,
                              mode="direct")
    saved_bg_relay.fixMode()
    saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    # select out only the currently chosen Q value
    gatedQ = net.make_array("gatedQ", N * 2, num_actions,
                            node_factory=HRLutils.node_fac(),
                            radius=Qradius)
    gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(vals_relay, gatedQ, pstc=tauPSC)

    net.connect(
        curr_bg_relay, gatedQ,
        transform=[[-3 if i != k else 0 for k in range(num_actions)]
                   for i in range(num_actions)
                   for _ in range(gatedQ.getNeurons() / num_actions)],
        pstc=tauPSC)

    currQ = net.make("currQ", 1, 1, mode="direct")
    currQ.fixMode()
    net.connect(gatedQ, currQ, transform=[[1 for _ in range(num_actions)]],
                pstc=0.001)

    # select out only the previously chosen Q value
    gatedstoreQ = net.make_array("gatedstoreQ", N * 2, num_actions,
                                 node_factory=HRLutils.node_fac(),
                                 radius=Qradius)
    gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

    net.connect(
        saved_bg_relay, gatedstoreQ,
        transform=[[-3 if i != k else 0 for k in range(num_actions)]
                   for i in range(num_actions)
                   for _ in range(gatedstoreQ.getNeurons() / num_actions)],
        pstc=tauPSC)

    storeQ = net.make("storeQ", 1, 1, mode="direct")
    storeQ.fixMode()
    net.connect(gatedstoreQ, storeQ,
                transform=[[1 for _ in range(num_actions)]], pstc=0.001)

    # create error calculation network
    error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius,
                                  Qradius=Qradius)
    net.add(error)

    net.connect(currQ, error.getTermination("currQ"))
    net.connect(storeQ, error.getTermination("storeQ"))

    # gate error by learning signal and saved BG output (we only want error
    # when the system is supposed to be learning, and we only want error
    # related to the action that was selected)
    gatederror = net.make_array("gatederror", N * 2, num_actions,
                                radius=errorcap,
                                node_factory=HRLutils.node_fac())
    gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    # scale the error by Qradius, so that we don't get super huge errors
    # (causes problems with the gating)
    net.connect(error, gatederror,
                transform=[[1.0 / Qradius] for _ in range(num_actions)],
                pstc=tauPSC)

    learninggate = net.make("learninggate", N, 1,
                            node_factory=HRLutils.node_fac())
    learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                False)

    net.connect(learninggate, gatederror, func=lambda x: [1.0],
                transform=[[-12] for _ in range(gatederror.getNeurons())],
                pstc=tauPSC)

    net.connect(
        saved_bg_relay, gatederror,
        transform=[[-12 if i != k else 0 for k in range(num_actions)]
                   for i in range(num_actions)
                   for _ in range(gatederror.getNeurons() / num_actions)],
        pstc=tauPSC)

    # add a positive bias to the error anywhere the Q values are negative
    # (to stop Q values from getting too negative, which causes problems
    # with the action selection)
    posbias = positivebias.PositiveBias(N, num_actions)
    net.add(posbias)
    net.connect(old_vals_relay, posbias.getTermination("input"))
    net.connect(learninggate, posbias.getTermination("learn"),
                func=lambda x: [1.0])

    biasederror = net.make("biasederror", 1, num_actions, mode="direct")
    biasederror.fixMode()
    net.connect(gatederror, biasederror, pstc=0.001)
    net.connect(posbias, biasederror, pstc=0.001)

    self.exposeTermination(curr_bg_relay.getTermination("input"),
                           "curr_bg_input")
    self.exposeTermination(saved_bg_relay.getTermination("input"),
                           "saved_bg_input")
    self.exposeTermination(vals_relay.getTermination("input"), "vals")
    self.exposeTermination(old_vals_relay.getTermination("input"),
                           "old_vals")
    self.exposeTermination(error.getTermination("reward"), "reward")
    self.exposeTermination(error.getTermination("reset"), "reset")
    self.exposeTermination(learninggate.getTermination("gate"), "learn")
    self.exposeOrigin(biasederror.getOrigin("X"), "error")
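# Plain-Python sketch of the signal this network approximates, per action
# dimension. ErrorCalc2's internals are not shown in this file, so the
# TD-style formula below (reward + discount * currQ - storeQ) is an
# assumption; the gating and positive-bias logic restates the connections
# above (error only for the selected action, only while learning, plus a
# small positive bias wherever the previous Q values are non-positive).
def error_network_reference(reward, currQ, storeQ, old_vals, selected,
                            learning, discount=0.3, errorcap=0.1,
                            biaslevel=0.03):
    if not learning:
        return [0.0] * len(old_vals)
    td = reward + discount * currQ - storeQ  # assumed ErrorCalc2 output
    td = max(-errorcap, min(errorcap, td))   # soft cap on error magnitude
    return [(td if i == selected else 0.0) +
            (biaslevel if old_vals[i] <= 0 else 0.0)
            for i in range(len(old_vals))]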
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None,
             state_evals=None, state_threshold=(0.0, 1.0),
             statediff_threshold=0.2, init_Qs=None):
    """Builds the QNetwork.

    :param stateN: number of neurons to use to represent state
    :param stateD: dimension of state vector
    :param state_encoders: encoders to use for neurons in state population
    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param learningrate: learning rate for action value learning rule
    :param stateradius: expected radius of state values
    :param Qradius: expected radius of Q values
    :param load_weights: filename to load Q value weights from
    :param state_evals: evaluation points to use for state population.
        This is used when initializing the Q values (may be necessary if
        the input states don't tend to fall in the hypersphere).
    :param state_threshold: threshold range of state neurons
    :param statediff_threshold: maximum state difference for dual training
    :param init_Qs: initial Q values
    """

    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

    # if True, use neuron-to-neuron weight learning; otherwise, use decoder
    # learning
    self.neuron_learning = False

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    if isinstance(state_threshold, (float, int)):
        state_threshold = (state_threshold, 1.0)
    state_fac.setIntercept(
        IndicatorPDF(state_threshold[0], state_threshold[1]))

    state_pop = net.make("state_pop", stateN, stateD,
                         radius=stateradius,
                         node_factory=state_fac,
                         encoders=state_encoders,
                         eval_points=state_evals)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # store the state value (used to drive population encoding previous
    # state)
    saved_state = memory.Memory("saved_state", N * 4, stateD,
                                inputscale=50, radius=stateradius,
                                direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    # create population representing previous state
    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    if self.neuron_learning:
        # use ActionValues network to compute Q values

        # current Q values
        decoders = state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN,
                                               actions, learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
        net.add(actionvals)

        net.connect(state_pop.getOrigin("AXON"),
                    actionvals.getTermination("state"))

        # Q values of previous state
        decoders = old_state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N,
                                                   stateN, actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
        net.add(old_actionvals)

        net.connect(old_state_pop.getOrigin("AXON"),
                    old_actionvals.getTermination("state"))
    else:
        # just use decoders on the state population to compute Q values

        # current Q values
        origin = state_pop.addDecodedOrigin(
            "vals",
            [ConstantFunction(num_actions, init_Qs[i])
             for i in range(num_actions)],
            "AXON")
        state_dlnode = decoderlearningnode.DecoderLearningNode(
            state_pop, origin, learningrate, num_actions,
            name="state_learningnode")
        net.add(state_dlnode)

        # just a little relay node, so that things match up for the rest
        # of the script when you have the neuron-to-neuron learning
        actionvals = net.make("actionvals", 1, num_actions, mode="direct")
        actionvals.fixMode()
        net.connect(origin, actionvals, pstc=0.001)

        # Q values of previous state
        origin = old_state_pop.addDecodedOrigin(
            "vals",
            [ConstantFunction(num_actions, init_Qs[i])
             for i in range(num_actions)],
            "AXON")
        old_state_dlnode = decoderlearningnode.DecoderLearningNode(
            old_state_pop, origin, learningrate, num_actions,
            name="old_state_learningnode")
        net.add(old_state_dlnode)

        old_actionvals = net.make("old_actionvals", 1, num_actions,
                                  mode="direct")
        old_actionvals.fixMode()
        net.connect(origin, old_actionvals, pstc=0.001)

    if load_weights is not None:
        self.loadParams(load_weights)

    # find error between old_actionvals and actionvals (this will be used
    # to drive learning on the new actionvals)
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling the values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff (we only want to train the curr state based on previous
    # state when the two have similar values)
    # note: threshold > 0 so that there is a deadzone in the middle (when
    # the states are similar) where there will be no output inhibition
    statediff = net.make_array("statediff", N, stateD,
                               intercept=(statediff_threshold, 1))
    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    # connect up valdiff to the error signal for current Q values, and
    # expose the error signal for the previous Q values to the external
    # error
    if self.neuron_learning:
        net.connect(valdiff, actionvals.getTermination("error"))
        self.exposeTermination(old_actionvals.getTermination("error"),
                               "error")
    else:
        net.connect(valdiff, state_dlnode.getTermination("error"))
        self.exposeTermination(old_state_dlnode.getTermination("error"),
                               "error")

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
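# Plain-Python sketch of the training signal assembled above (ignoring
# neural dynamics and filtering): the error driving the current Q values is
# twice the difference between the previous and current Q estimates, and it
# is shut off whenever the current and saved states differ noticeably, so
# the current state is only trained toward the previous state's values when
# the two states are effectively the same.
def valdiff_reference(vals, old_vals, state, saved_state,
                      statediff_threshold=0.2):
    if any(abs(s - o) > statediff_threshold
           for s, o in zip(state, saved_state)):
        return [0.0] * len(vals)  # states differ: statediff inhibits valdiff
    return [2 * (oq - q) for q, oq in zip(vals, old_vals)]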
def __init__(self, name, N, d, scale=1.0, weights=None, maxinput=1.0,
             oneDinput=False):
    # scale is a scale on the output of the multiplication
    #   output = (input1 .* input2) * scale
    # weights are optional matrices applied to each input
    #   output = (C1*input1 .* C2*input2) * scale
    # maxinput is the maximum expected value of any dimension of the inputs.
    #   this is used to scale the inputs internally so that the length of
    #   the vectors in the intermediate populations are not too small (which
    #   results in a lot of noise in the calculations)
    # oneDinput indicates that the second input is one dimensional, and is
    #   just a scale on the first input rather than an element-wise product

    self.name = name
    tauPSC = 0.007

    # the size of the intermediate populations
    smallN = int(math.ceil(float(N) / d))

    # the maximum value of the vectors represented by the intermediate
    # populations. the vector is at most [maxinput maxinput], so the length
    # of that is sqrt(maxinput**2 + maxinput**2)
    maxlength = math.sqrt(2 * maxinput ** 2)

    if weights is not None and len(weights) != 2:
        print "Warning, other than 2 matrices given to eprod"

    if weights is None:
        weights = [MU.I(d), MU.I(d)]

    inputd = len(weights[0][0])

    ef = HRLutils.defaultEnsembleFactory()

    # create input populations
    in1 = ef.make("in1", 1, inputd)
    in1.addDecodedTermination("input", MU.I(inputd), 0.001, False)
    self.addNode(in1)
    in1.setMode(SimulationMode.DIRECT)  # since this is just a relay
    in1.fixMode()

    in2 = ef.make("in2", 1, inputd)
    if not oneDinput:
        in2.addDecodedTermination("input", MU.I(inputd), 0.001, False)
    else:
        # if it is a 1-D input we just expand it to a full vector of that
        # value so that we can treat it as an element-wise product
        in2.addDecodedTermination("input", [[1] for i in range(inputd)],
                                  0.001, False)
    self.addNode(in2)
    in2.setMode(SimulationMode.DIRECT)  # since this is just a relay
    in2.fixMode()

    # ensemble factory for the intermediate populations
    multef = NEFEnsembleFactoryImpl()
    multef.nodeFactory.tauRC = 0.05
    multef.nodeFactory.tauRef = 0.002
    multef.nodeFactory.maxRate = IndicatorPDF(200, 500)
    multef.nodeFactory.intercept = IndicatorPDF(-1, 1)
    multef.encoderFactory = vectorgenerators.MultiplicationVectorGenerator()
    multef.beQuiet()

    result = ef.make("result", 1, d)
    result.setMode(SimulationMode.DIRECT)  # since this is just a relay
    result.fixMode()
    self.addNode(result)

    resultTerm = [[0] for _ in range(d)]
    zeros = [0 for _ in range(inputd)]

    for e in range(d):
        # create a 2D population for each output dimension which will
        # combine the components from one dimension of each of the input
        # populations
        mpop = multef.make('mpop_' + str(e), smallN, 2)

        # make two connections that will select one component from each of
        # the input pops
        # we divide by maxlength to ensure that the maximum length of the
        # 2D vector is 1
        # remember that (for some reason) the convention in Nengo is that
        # the input matrices are the transpose of what they would be
        # mathematically
        mpop.addDecodedTermination('a',
                                   [[(1.0 / maxlength) * weights[0][e][i]
                                     for i in range(inputd)], zeros],
                                   tauPSC, False)
        mpop.addDecodedTermination('b',
                                   [zeros,
                                    [(1.0 / maxlength) * weights[1][e][i]
                                     for i in range(inputd)]],
                                   tauPSC, False)

        # multiply the two selected components together
        mpop.addDecodedOrigin("output", [PostfixFunction('x0*x1', 2)],
                              "AXON")

        self.addNode(mpop)
        self.addProjection(in1.getOrigin('X'), mpop.getTermination('a'))
        self.addProjection(in2.getOrigin('X'), mpop.getTermination('b'))

        # combine the 1D results back into one vector.
        # we scaled each input by 1/maxlength, then multiplied them
        # together for a total scale of 1/maxlength**2, so to undo we
        # multiply by maxlength**2
        resultTerm[e] = [maxlength ** 2 * scale]
        result.addDecodedTermination('in_' + str(e), resultTerm, 0.001,
                                     False)
        resultTerm[e] = [0]

        self.addProjection(mpop.getOrigin('output'),
                           result.getTermination('in_' + str(e)))

    self.exposeTermination(in1.getTermination("input"), "A")
    self.exposeTermination(in2.getTermination("input"), "B")
    self.exposeOrigin(result.getOrigin("X"), "X")
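# Plain-Python reference for what this network computes in the standard
# (not oneDinput) case, ignoring the neural approximation: the d-dimensional
# output is (C1*input1 .* C2*input2) * scale, where ".*" denotes the
# element-wise product and C1, C2 default to identity matrices.
def eprod_reference(a, b, weights=None, scale=1.0):
    if weights is None:
        # identity weights: plain element-wise product
        return [x * y * scale for x, y in zip(a, b)]
    wa, wb = weights
    return [sum(w * x for w, x in zip(wa[e], a)) *
            sum(w * x for w, x in zip(wb[e], b)) * scale
            for e in range(len(wa))]

# e.g. eprod_reference([1, 2], [3, 4]) -> [3.0, 8.0]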
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None):
    NetworkImpl.__init__(self)
    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    statelength = math.sqrt(2 * stateradius ** 2)
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = 0.0
    weight_save = 600.0  # period to save weights (realtime, not simulation time)

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    state_fac.setIntercept(IndicatorPDF(0, 1))

    state_pop = net.make("state_pop", stateN, stateD,
                         radius=statelength,
                         node_factory=state_fac,
                         encoders=state_encoders)
    #                     eval_points=MU.I(stateD))
    # state_pop = net.make_array("state_pop", stateN / stateD, stateD,
    #                            node_factory=state_fac)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # create population tied to previous state (to be used in learning)
    saved_state = memory.Memory("saved_state", N * 4, stateD, inputscale=50,
                                radius=stateradius, direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=statelength,
                             node_factory=state_fac,
                             encoders=state_encoders)
    #                         eval_points=MU.I(stateD))
    # old_state_pop = net.make_array("old_state_pop", stateN / stateD,
    #                                stateD, node_factory=state_fac)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()
    actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions,
                                           learningrate, Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    decoders = old_state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()
    old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN,
                                               actions, learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
    net.add(old_actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))
    net.connect(old_state_pop.getOrigin("AXON"),
                old_actionvals.getTermination("state"))

    if load_weights is not None:
        self.loadWeights(load_weights)

    # find error between old_actionvals and actionvals
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff
    statediff = net.make_array("statediff", N, stateD, intercept=(0.2, 1))
    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    net.connect(valdiff, actionvals.getTermination("error"))

    # periodically save the weights
    class WeightSaveThread(threading.Thread):
        def __init__(self, func, prefix, period):
            threading.Thread.__init__(self)
            self.func = func
            self.prefix = prefix
            self.period = period

        def run(self):
            while True:
                time.sleep(self.period)
                self.func(self.prefix)

    wsn = WeightSaveThread(self.saveWeights,
                           os.path.join("weights", "tmp"), weight_save)
    wsn.start()

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(old_actionvals.getTermination("error"), "error")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")