Exemplo n.º 1
0
    def gen_encoders(self, N, contextD, context_scale):
        """Generate encoders for state population of learning agent.

        :param N: number of neurons in state population
        :param contextD: dimension of context vector representation
        :param context_scale: weight on context representation relative to
            state (1.0 = equal weighting)
        """

        if contextD > 0:
            contexts = MU.I(contextD)
        else:
            contexts = [[]]

        # neurons each sensitive to different combinations of stimuli
        encs = (list(MU.I(self.stateD)) +
                [o + s + c
                 for o in MU.I(self.num_orientations)
                 for s in MU.I(self.num_shapes)
                 for c in MU.I(self.num_colours)])

        return [HRLutils.normalize(
            HRLutils.normalize(random.choice(encs)) +
            [x * context_scale for x in random.choice(contexts)])
            for _ in range(N)]
Exemplo n.º 2
0
    def __init__(self, actions, mapname, contextD, context_rewards, **kwargs):
        """Initialize the environment variables.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param mapname: filename for map file
        :param contextD: dimension of vector representing context
        :param context_rewards: mapping from region labels to rewards for being
            in that region (each entry represents one context)
            :type context_rewards: dict {"regionlabel":rewardval,...}
        :param **kwargs: see PlaceCellEnvironment.__init__
        """

        PlaceCellEnvironment.__init__(self,
                                      actions,
                                      mapname,
                                      name="ContextEnvironment",
                                      **kwargs)

        self.rewards = context_rewards

        # generate vectors representing each context
        self.contexts = {}  # mapping from region label to context vector
        for i, r in enumerate(self.rewards):
            self.contexts[r] = list(MU.I(contextD)[i])

        self.context = self.contexts[random.choice(self.contexts.keys())]

        # randomly pick a new context every context_delay seconds
        self.context_delay = 60
        self.context_update = self.context_delay

        self.create_origin("placewcontext",
                           lambda: self.place_activations + self.context)
        self.create_origin("context", lambda: self.context)
Exemplo n.º 3
0
    def __init__(self, N, d, name="PositiveBias"):
        """Builds the PositiveBias network.

        :param N: base number of neurons
        :param d: dimension of input signal
        :param name: name for network
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        biaslevel = 0.03  # the value to be output for negative inputs

        # threshold the input signal to detect positive values
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0, 0.1))
        neg_thresh = net.make_array("neg_thresh",
                                    N,
                                    d,
                                    encoders=[[1]],
                                    node_factory=nfac)
        neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

        # create a population that tries to output biaslevel across
        # all dimensions
        bias_input = net.make_input("bias_input", [biaslevel])
        bias_pop = net.make_array(
            "bias_pop",
            N,
            d,
            node_factory=HRLutils.node_fac(),
            eval_points=[[x * 0.01] for x in range(0, biaslevel * 200)])

        net.connect(bias_input, bias_pop, pstc=tauPSC)

        # the individual dimensions of bias_pop are then inhibited by the
        # output of neg_thresh (so any positive values don't get the bias)
        net.connect(neg_thresh,
                    bias_pop,
                    pstc=tauPSC,
                    func=lambda x: [1.0] if x[0] > 0 else [0.0],
                    transform=[[-10 if i == k else 0 for k in range(d)]
                               for i in range(d)
                               for _ in range(bias_pop.getNeurons() / d)])

        # the whole population is inhibited by the learn signal, so that it
        # outputs 0 if the system isn't supposed to be learning
        bias_pop.addTermination("learn",
                                [[-10] for _ in range(bias_pop.getNeurons())],
                                tauPSC, False)

        self.exposeTermination(neg_thresh.getTermination("input"), "input")
        self.exposeTermination(bias_pop.getTermination("learn"), "learn")
        self.exposeOrigin(bias_pop.getOrigin("X"), "X")
Exemplo n.º 4
0
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius**2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop",
        stateN,
        stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[[x / statelength, y / statelength]
                     for x in range(-int(stateradius), int(stateradius))
                     for y in range(-int(stateradius), int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC, False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin("init_decoders",
                                          [ConstantFunction(stateD, init_Qs)],
                                          "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues",
                                           N,
                                           stateN,
                                           actions,
                                           learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
Exemplo n.º 5
0
    def gen_encoders(self, N, contextD, context_scale):
        """Generates encoders for state population in RL agent.

        State aspect of encoders comes from PlaceCellEnvironment. Context
        component is a unit vector with contextD dimensions and length
        context_scale.
        """

        s_encoders = PlaceCellEnvironment.gen_encoders(self, N)
        c_encoders = [random.choice(MU.I(contextD)) for _ in range(N)]
        c_encoders = [[x * context_scale for x in enc] for enc in c_encoders]
        encoders = [s + list(c) for s, c in zip(s_encoders, c_encoders)]
        encoders = [[x / math.sqrt(sum([y**2 for y in e])) for x in e]
                    for e in encoders]
        return encoders
Exemplo n.º 6
0
def qnetwork(stateN, stateD, state_encoders, actions, learningrate,
                 stateradius=1.0, Qradius=1.0, load_weights=None):
    net = nef.Network("QNetwork")
    with declarative_syntax(net):
        N = 50
        statelength = math.sqrt(2*stateradius**2)
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.0
        weight_save = 600.0 #period to save weights (realtime, not simulation time)

        #set up relays
        direct_mode('state_relay', 1, dimension=stateD)
        add_decoded_termination('state_relay', 'input', MU.I(stateD), .001, False)

        #create state population
        ensemble('state_pop', neurons=LIF(stateN),
                 dimensions=stateD,
                 radius=statelength,
                 encoders=state_encoders,
                )
        connect('state_relay', 'state_pop', filter=tauPSC)

        memory('saved_state', neurons=LIF(N * 4), dimension=stateD,
               inputscale=50,
               radius=stateradius,
               direct_storage=True)

        # N.B. the "." syntax refers to an ensemble created by the `memory` macro
        connect('state_relay', 'saved_state.target')

        ensemble('old_state_pop', neurons=LIF(stateN),
                 dimensions=stateD,
                 radius=statelength,
                 encoders=state_encoders)

        connect('saved_state', 'old_state_pop', filter=tauPSC)

        # mess with the intercepts ?
        for name in 'state_pop', 'old_state_pop':
            set_intercepts(name, IndicatorPDF(0, 1))

        fixMode('state_relay')
        fixMode('state_pop', ['default', 'rate'])
        fixMode('old_state_pop', ['default', 'rate'])
Exemplo n.º 7
0
def run_badreenvironment(nav_args,
                         ctrl_args,
                         bias=0.0,
                         seed=None,
                         flat=False,
                         label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (
        list(MU.diag([3 for _ in range(env.stateD)])) +
        [o + s + c for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN,
                                    env.stateD,
                                    env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc,
                                    state_evals=evals,
                                    discount=0.5,
                                    **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="NavTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN,
                                     env.stateD,
                                     actions,
                                     name="CtrlAgent",
                                     state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals,
                                     discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="CtrlTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"),
                reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"),
                reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay,
                ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
    #     def ctrl_reward_func(x):
    #         if abs(x[0]) < 0.5:
    #             return 0.0
    #
    #         if flat:
    #             return 1.5 if x[1] + x[2] < 0.5 else -1.5
    #         else:
    #             if x[1] + x[2] < 0.5:
    #                 return -1.5
    #             if [round(a) for a in env.state[-2:]] == [round(b)
    #                                                       for b in x[1:]]:
    #                 return 1.5
    #             else:
    #                 return -1.5
    #     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
    #                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] == env.state[:env.num_orientations]
                    else -1.5)
        else:
            return (1.5 if env.action[1]
                    == env.state[env.num_orientations:-env.num_colours] else
                    -1.5)

    net.connect(reward_relay,
                nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib",
                                      50,
                                      env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]

    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib,
                boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"),
                boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost,
                nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    weight_save = 1.0  # period to save weights (realtime, not simulation time)
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)),
            weight_save)
    ]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(
        period=1,
        filename=HRLutils.datafile("dataoutput_%s.txt" % label),
        header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

    #     net.add_to_nengo()
    #     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
Exemplo n.º 8
0
def run_deliveryenvironment(navargs, ctrlargs, tag=None, seed=None):
    """Runs the model on the delivery task.

    :param navargs: kwargs for the nav_agent (see SMDPAgent.__init__)
    :param ctrlargs: kwargs for the ctrl_agent (see SMDPAgent.__init__)
    :param tag: string appended to datafiles associated with this run
    :param seed: random seed used for this run
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    if tag is None:
        tag = str(seed)

    net = nef.Network("runDeliveryEnvironment", seed=seed)

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # relative scale of context vector vs state vector
    max_state_input = 2  # maximum length of input vector to state population

    # labels and vectors corresponding to basic actions available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    if "load_weights" in navargs and navargs["load_weights"] is not None:
        navargs["load_weights"] += "_%s" % tag
    if "load_weights" in ctrlargs and ctrlargs["load_weights"] is not None:
        ctrlargs["load_weights"] += "_%s" % tag

    # ##ENVIRONMENT

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    # generate encoders and divide them by max_state_input (so that inputs
    # will be scaled down to radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # read in eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % tag)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN,
                                    len(env.placecells) + contextD,
                                    actions,
                                    name="NavAgent",
                                    state_encoders=enc,
                                    state_evals=evals,
                                    state_threshold=0.8,
                                    **navargs)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # output of nav_agent is what goes to the environment
    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # termination node for nav_agent (just a timer that goes off regularly)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None},
        env,
        contextD=2,
        name="NavTermNode")
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    # ##CTRL AGENT

    # actions corresponding to "go to A" or "go to B"
    actions = [("a", [0, 1]), ("b", [1, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN,
                                     len(env.placecells) + contextD,
                                     actions,
                                     name="CtrlAgent",
                                     state_encoders=enc,
                                     state_evals=evals,
                                     state_threshold=0.8,
                                     **ctrlargs)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    # ctrl_agent gets environmental state and reward
    net.connect(env.getOrigin("placewcontext"),
                ctrl_agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), ctrl_agent.getTermination("reward"))

    # termination node for ctrl_agent (terminates whenever the agent is in the
    # state targeted by the ctrl_agent)
    # also has a long timer so that ctrl_agent doesn't get permanently stuck
    # in one action
    ctrl_term_node = terminationnode.TerminationNode(
        {
            "a": [0, 1],
            "b": [1, 0],
            terminationnode.Timer((30, 30)): None
        },
        env,
        contextD=2,
        name="CtrlTermNode",
        rewardval=1.5)
    net.add(ctrl_term_node)

    # reward for nav_agent is the pseudoreward from ctrl_agent termination
    net.connect(ctrl_term_node.getOrigin("pseudoreward"),
                nav_agent.getTermination("reward"))

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # connect ctrl_agent action to termination context
    # this is used so that ctrl_term_node knows what the current goal is (to
    # determine termination and pseudoreward)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_term_node.getTermination("context"))

    # state input for nav_agent is the environmental state + the output of
    # ctrl_agent
    ctrl_output_relay = net.make("ctrl_output_relay",
                                 1,
                                 len(env.placecells) + contextD,
                                 mode="direct")
    ctrl_output_relay.fixMode()
    trans = (list(MU.I(len(env.placecells))) +
             [[0 for _ in range(len(env.placecells))]
              for _ in range(contextD)])
    net.connect(env.getOrigin("place"), ctrl_output_relay, transform=trans)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_output_relay,
                transform=([[0 for _ in range(contextD)]
                            for _ in range(len(env.placecells))] +
                           list(MU.I(contextD))))

    net.connect(ctrl_output_relay, nav_agent.getTermination("state_input"))

    # periodically save the weights

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0

    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, tag)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, tag)),
            weight_save)
    ]

    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        tag))
    net.add(data)
    data.record(env.getOrigin("reward"))
    q_net = ctrl_agent.getNode("QNetwork")
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))

    #     net.add_to_nengo()
    #     net.run(10000)
    net.view()

    for t in threads:
        t.stop()
Exemplo n.º 9
0
    def __init__(self,
                 num_actions,
                 Qradius=1.0,
                 rewardradius=1.0,
                 discount=0.3):
        """Builds the ErrorNetwork.

        :param num_actions: the number of actions available to the system
        :param Qradius: expected radius of Q values
        :param rewardradius: expected radius of reward signal
        :param discount: discount factor
        """

        self.name = "ErrorNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        errorcap = 0.1  # soft cap on error magnitude (large errors seem to
        # cause problems with overly-generalizing the learning)

        # set up relays
        vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
        vals_relay.fixMode()
        vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

        old_vals_relay = net.make("old_vals_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        old_vals_relay.fixMode()
        old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        curr_bg_relay = net.make("curr_bg_relay",
                                 1,
                                 num_actions,
                                 mode="direct")
        curr_bg_relay.fixMode()
        curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                            False)

        saved_bg_relay = net.make("saved_bg_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        saved_bg_relay.fixMode()
        saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        # select out only the currently chosen Q value
        gatedQ = net.make_array("gatedQ",
                                N * 2,
                                num_actions,
                                node_factory=HRLutils.node_fac(),
                                radius=Qradius)
        gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(vals_relay, gatedQ, pstc=tauPSC)

        net.connect(
            curr_bg_relay,
            gatedQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedQ.getNeurons() / num_actions)],
            pstc=tauPSC)

        currQ = net.make("currQ", 1, 1, mode="direct")
        currQ.fixMode()
        net.connect(gatedQ,
                    currQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # select out only the previously chosen Q value
        gatedstoreQ = net.make_array("gatedstoreQ",
                                     N * 2,
                                     num_actions,
                                     node_factory=HRLutils.node_fac(),
                                     radius=Qradius)
        gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatedstoreQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedstoreQ.getNeurons() / num_actions)],
            pstc=tauPSC)

        storeQ = net.make("storeQ", 1, 1, mode="direct")
        storeQ.fixMode()
        net.connect(gatedstoreQ,
                    storeQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # create error calculation network
        error = errorcalc2.ErrorCalc2(discount,
                                      rewardradius=rewardradius,
                                      Qradius=Qradius)
        net.add(error)

        net.connect(currQ, error.getTermination("currQ"))
        net.connect(storeQ, error.getTermination("storeQ"))

        # gate error by learning signal and saved BG output (we only want error
        # when the system is supposed to be learning, and we only want error
        # related to the action that was selected)
        gatederror = net.make_array("gatederror",
                                    N * 2,
                                    num_actions,
                                    radius=errorcap,
                                    node_factory=HRLutils.node_fac())
        gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(error,
                    gatederror,
                    transform=[[1.0 / Qradius] for _ in range(num_actions)],
                    pstc=tauPSC)
        # scale the error by Qradius, so that we don't get super huge errors
        # (causes problems with the gating)

        learninggate = net.make("learninggate",
                                N,
                                1,
                                node_factory=HRLutils.node_fac())
        learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                    False)

        net.connect(learninggate,
                    gatederror,
                    func=lambda x: [1.0],
                    transform=[[-12] for _ in range(gatederror.getNeurons())],
                    pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatederror,
            transform=[[-12 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatederror.getNeurons() / num_actions)],
            pstc=tauPSC)

        # add a positive bias to the error anywhere the Q values are negative
        # (to stop Q values from getting too negative, which causes problems
        # with the action selection)
        posbias = positivebias.PositiveBias(N, num_actions)
        net.add(posbias)
        net.connect(old_vals_relay, posbias.getTermination("input"))
        net.connect(learninggate,
                    posbias.getTermination("learn"),
                    func=lambda x: [1.0])

        biasederror = net.make("biasederror", 1, num_actions, mode="direct")
        biasederror.fixMode()
        net.connect(gatederror, biasederror, pstc=0.001)
        net.connect(posbias, biasederror, pstc=0.001)

        self.exposeTermination(curr_bg_relay.getTermination("input"),
                               "curr_bg_input")
        self.exposeTermination(saved_bg_relay.getTermination("input"),
                               "saved_bg_input")
        self.exposeTermination(vals_relay.getTermination("input"), "vals")
        self.exposeTermination(old_vals_relay.getTermination("input"),
                               "old_vals")
        self.exposeTermination(error.getTermination("reward"), "reward")
        self.exposeTermination(error.getTermination("reset"), "reset")
        self.exposeTermination(learninggate.getTermination("gate"), "learn")
        self.exposeOrigin(biasederror.getOrigin("X"), "error")
Exemplo n.º 10
0
    def __init__(self,
                 stateN,
                 stateD,
                 state_encoders,
                 actions,
                 learningrate,
                 stateradius=1.0,
                 Qradius=1.0,
                 load_weights=None,
                 state_evals=None,
                 state_threshold=(0.0, 1.0),
                 statediff_threshold=0.2,
                 init_Qs=None):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if
            the input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold range of state neurons
        :param statediff_threshold: maximum state difference for dual training
        :param init_Qs: initial Q values
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

        # if True, use neuron--neuron weight learning, otherwise, use decoder
        # learning
        self.neuron_learning = False

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        if isinstance(state_threshold, (float, int)):
            state_threshold = (state_threshold, 1.0)
        state_fac.setIntercept(
            IndicatorPDF(state_threshold[0], state_threshold[1]))

        state_pop = net.make("state_pop",
                             stateN,
                             stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous
        # state)
        saved_state = memory.Memory("saved_state",
                                    N * 4,
                                    stateD,
                                    inputscale=50,
                                    radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop",
                                 stateN,
                                 stateD,
                                 radius=stateradius,
                                 node_factory=state_fac,
                                 encoders=state_encoders,
                                 eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            decoders = state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals",
                                                   N,
                                                   stateN,
                                                   actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"),
                        actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals",
                                                       N,
                                                       stateN,
                                                       actions,
                                                       learningrate,
                                                       Qradius=Qradius,
                                                       init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"),
                        old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(
                state_pop,
                origin,
                learningrate,
                num_actions,
                name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up for the rest of
            # the script when you have the neuron -- neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(
                old_state_pop,
                origin,
                learningrate,
                num_actions,
                name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals",
                                      1,
                                      num_actions,
                                      mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used
        # to drive learning on the new actionvals)
        valdiff = net.make_array("valdiff",
                                 N,
                                 num_actions,
                                 node_factory=HRLutils.node_fac())
        # doubling the values to get a bigger error signal
        net.connect(old_actionvals,
                    valdiff,
                    transform=MU.diag([2] * num_actions),
                    pstc=tauPSC)
        net.connect(actionvals,
                    valdiff,
                    transform=MU.diag([-2] * num_actions),
                    pstc=tauPSC)

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the curr state based on previous
        # state when the two have similar values)
        # note: threshold > 0 so that there is a deadzone in the middle (when
        # the states are similar) where there will be no output inhibition
        statediff = net.make_array("statediff",
                                   N,
                                   stateD,
                                   intercept=(statediff_threshold, 1))

        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state,
                    statediff,
                    transform=MU.diag([-1] * stateD),
                    pstc=tauPSC)

        net.connect(statediff,
                    valdiff,
                    func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD
                               for _ in range(valdiff.getNeurons())],
                    pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and
        # expose the error signal for the previous Q values to the external
        # error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"),
                                   "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"),
                                   "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"),
                               "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
Exemplo n.º 11
0
    def __init__(self,
                 name,
                 N,
                 d,
                 scale=1.0,
                 weights=None,
                 maxinput=1.0,
                 oneDinput=False):
        # scale is a scale on the output of the multiplication
        # output = (input1.*input2)*scale

        # weights are optional matrices applied to each input
        # output = (C1*input1 .* C2*input2)*scale

        # maxinput is the maximum expected value of any dimension of the
        # inputs. this is used to scale the inputs internally so that the
        # length of the vectors in the intermediate populations are not
        # too small (which results in a lot of noise in the calculations)

        # oneDinput indicates that the second input is one dimensional, and is
        # just a scale on the first input rather than an element-wise product

        self.name = name
        tauPSC = 0.007

        # the size of the intermediate populations
        smallN = int(math.ceil(float(N) / d))

        # the maximum value of the vectors represented by the intermediate
        # populations. the vector is at most [maxinput maxinput], so the length
        # of that is sqrt(maxinput**2 + maxinput**2)
        maxlength = math.sqrt(2 * maxinput**2)

        if weights is not None and len(weights) != 2:
            print "Warning, other than 2 matrices given to eprod"

        if weights is None:
            weights = [MU.I(d), MU.I(d)]

        inputd = len(weights[0][0])

        ef = HRLutils.defaultEnsembleFactory()

        # create input populations
        in1 = ef.make("in1", 1, inputd)
        in1.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        self.addNode(in1)
        in1.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in1.fixMode()

        in2 = ef.make("in2", 1, inputd)
        if not oneDinput:
            in2.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        else:
            # if it is a 1-D input we just expand it to a full vector of that
            # value so that we can treat it as an element-wise product
            in2.addDecodedTermination("input", [[1] for i in range(inputd)],
                                      0.001, False)
        self.addNode(in2)
        in2.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in2.fixMode()

        # ensemble for intermediate populations
        multef = NEFEnsembleFactoryImpl()
        multef.nodeFactory.tauRC = 0.05
        multef.nodeFactory.tauRef = 0.002
        multef.nodeFactory.maxRate = IndicatorPDF(200, 500)
        multef.nodeFactory.intercept = IndicatorPDF(-1, 1)
        multef.encoderFactory = vectorgenerators.MultiplicationVectorGenerator(
        )
        multef.beQuiet()

        result = ef.make("result", 1, d)
        result.setMode(SimulationMode.DIRECT)  # since this is just a relay
        result.fixMode()
        self.addNode(result)

        resultTerm = [[0] for _ in range(d)]
        zeros = [0 for _ in range(inputd)]

        for e in range(d):
            # create a 2D population for each input dimension which will
            # combine the components from one dimension of each of the input
            # populations
            mpop = multef.make('mpop_' + str(e), smallN, 2)

            # make two connection that will select one component from each of
            # the input pops
            # we divide by maxlength to ensure that the maximum length of the
            # 2D vector is 1
            # remember that (for some reason) the convention in Nengo is that
            # the input matrices are transpose of what they would be
            # mathematically
            mpop.addDecodedTermination('a',
                                       [[(1.0 / maxlength) * weights[0][e][i]
                                         for i in range(inputd)], zeros],
                                       tauPSC, False)
            mpop.addDecodedTermination('b', [
                zeros,
                [(1.0 / maxlength) * weights[1][e][i] for i in range(inputd)]
            ], tauPSC, False)

            # multiply the two selected components together
            mpop.addDecodedOrigin("output", [PostfixFunction('x0*x1', 2)],
                                  "AXON")

            self.addNode(mpop)
            self.addProjection(in1.getOrigin('X'), mpop.getTermination('a'))
            self.addProjection(in2.getOrigin('X'), mpop.getTermination('b'))

            # combine the 1D results back into one vector.
            # we scaled each input by 1/maxlength, then multiplied them
            # together for a total scale of 1/maxlength**2, so to undo we
            # multiply by maxlength**2
            resultTerm[e] = [maxlength**2 * scale]
            result.addDecodedTermination('in_' + str(e), resultTerm, 0.001,
                                         False)
            resultTerm[e] = [0]

            self.addProjection(mpop.getOrigin('output'),
                               result.getTermination('in_' + str(e)))

        self.exposeTermination(in1.getTermination("input"), "A")
        self.exposeTermination(in2.getTermination("input"), "B")
        self.exposeOrigin(result.getOrigin("X"), "X")
Exemplo n.º 12
0
    def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
                 stateradius=1.0, Qradius=1.0, load_weights=None):
        NetworkImpl.__init__(self)
        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)
        
        N = 50
        statelength = math.sqrt(2*stateradius**2)
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.0
        weight_save = 600.0 #period to save weights (realtime, not simulation time)
        
        #set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)
        
        #create state population
        state_fac = HRLutils.node_fac()
        state_fac.setIntercept(IndicatorPDF(0,1))
            
        state_pop = net.make("state_pop", stateN, stateD, 
                              radius=statelength,
                              node_factory=state_fac,
                              encoders=state_encoders)
#                              eval_points=MU.I(stateD))
#        state_pop = net.make_array("state_pop", stateN/stateD, stateD,
#                                   node_factory=state_fac)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        
        net.connect(state_relay, state_pop, pstc=tauPSC)
        
        #create population tied to previous state (to be used in learning)
        saved_state = memory.Memory("saved_state", N*4, stateD, inputscale=50, radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)
        
        net.connect(state_relay, saved_state.getTermination("target"))
        
        old_state_pop = net.make("old_state_pop", stateN, stateD, 
                              radius=statelength,
                              node_factory=state_fac,
                              encoders=state_encoders)
#                              eval_points=MU.I(stateD))
#        old_state_pop = net.make_array("old_state_pop", stateN/stateD, stateD,
#                                   node_factory=state_fac)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        
        net.connect(saved_state, old_state_pop, pstc=tauPSC)
        
        #set up action nodes
        decoders = state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD,init_Qs)], "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders)
        net.add(actionvals)
        
        decoders = old_state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD,init_Qs)], "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders)
        net.add(old_actionvals)
        
        net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))
        net.connect(old_state_pop.getOrigin("AXON"), old_actionvals.getTermination("state"))
        
        if load_weights != None:
            self.loadWeights(load_weights)
        
            #find error between old_actionvals and actionvals
        valdiff = net.make_array("valdiff", N, num_actions, node_factory = HRLutils.node_fac())
        net.connect(old_actionvals, valdiff, transform=MU.diag([2]*num_actions), pstc=tauPSC)
        net.connect(actionvals, valdiff, transform=MU.diag([-2]*num_actions), pstc=tauPSC)
            #doubling values to get a bigger error signal
        
            #calculate diff between curr_state and saved_state and use that to gate valdiff
        statediff = net.make_array("statediff", N, stateD, intercept=(0.2,1))
        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state, statediff, transform=MU.diag([-1]*stateD), pstc=tauPSC)
        
        net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x], 
                    transform = [[-10]*stateD for _ in range(valdiff.getNeurons())], pstc=tauPSC)
        
        net.connect(valdiff, actionvals.getTermination("error"))
        
        #periodically save the weights
        class WeightSaveThread(threading.Thread):
            def __init__(self, func, prefix, period):
                threading.Thread.__init__(self)
                self.func = func
                self.prefix = prefix
                self.period = period
                
            def run(self):
                while True:
                    time.sleep(self.period)
                    self.func(self.prefix)
        wsn = WeightSaveThread(self.saveWeights, os.path.join("weights","tmp"), weight_save)
        wsn.start()
        
        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(old_actionvals.getTermination("error"), "error")
        self.exposeTermination(saved_state.getTermination("transfer"), "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")