Example #1
    def act(self, rs):
        mes.currentMessage(
            "selecting action according to current believed state")
        action = self.qAgent.policy(self.currentState, rs)

        self.actionHistory = action

        mes.currentMessage("performing action: " + str(action))
        (self.environment).performAction(
            action)  # here actual state is updated
        self.updatePerceivedTime()

        mes.currentMessage("perceiving")
        newState = self.perceive(self.problemStateDefinition)  # PARAMETRIZE

        mes.message("current problem state: " + str(newState))
        newGState = self.perceive(self.goalStateDefinition)
        reward = self.R(newGState)
        self.rewardHistory = reward

        mes.currentMessage("Reward:" + str(reward))

        mes.currentMessage("learning from previous transition: ")
        self.qAgent.learn(newState, reward)

        self.currentState = newState
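
A hedged driver sketch for the act method above: it builds an agent with the constructor shown in Example #7 and steps the perceive-act-learn loop. The module and class names (agent_module, agent), the loop length, and the reward-signal index rs=0 are illustrative assumptions, not taken from the project.

from agent_module import agent      # hypothetical module/class names

bot = agent(startingState="c", graphic=False, suppressPrint=True)
for step in range(100):
    bot.act(0)                      # rs=0: act and learn against the first reward signal
print("last reward:", bot.rewardHistory)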
Example #2
    def __init__(self, fileName, agent, startingState, graph):
        mes.currentMessage("linking agent")
        (self.agent) = agent
        mes.currentMessage("converting SVG file into lines format")
        (self.lines, self.size) = con._convertFile(fileName)
        mes.currentMessage("converting lines format into map format")
        (self.maps) = con._convertLines(self.lines, self.size)

        mes.currentMessage("retreiving interest points")
        (self.features,
         self.ftNames) = _getFeaturesMatrix(self.maps, self.size)

        mes.settingMessage("world")

        if (startingState == "c" or startingState == "center"
                or startingState == "centre"):
            startingState = int(((self.size)[0] * (self.size)[1]) / 2 +
                                (self.size)[0] / 2) - 1

        #self.world = pdm.stochasticMaze(self.size, self.lines, _preDefRewardSet(self.features), startingState)
        self.world = pdm.stochasticMaze(self.size, self.lines, [],
                                        startingState)
        mes.setMessage("world")

        self.graph = graph
Example #3
    def policy(self, state, rs, learning=False):
        (a, stateValue) = self.argMaxQ(state, rs)
        mes.currentMessage("evaluating state at: " + str(stateValue) +
                           ", with best action: " + str(a))

        mes.currentMessage("acting rationally")

        if not learning:
            self.previous_state = state
            self.last_action = a

            self.last_policy = rs

        return a
Example #4
    def policy(self, state, rs=0, layer=None):
        if not layer and layer != 0:
            layer = len(self.hierarchy) - 1

        action = self.hierarchy[layer].policy(state, rs)
        self.hierarchy[layer].rec(rs)

        if layer == len(self.hierarchy) - 1:
            self.updateBNData(state, rs)

        abstract_state = self.state_abstraction(state, layer, rs)
        self.updateData(abstract_state, rs, layer)

        if layer != 0:
            mes.currentMessage("action: " + str(action) + "/" +
                               str(len(self.hierarchy[layer - 1].Q) - 1))
            return self.policy(state, action, layer - 1)

        return action
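
The recursion above walks the hierarchy top-down: the chosen action at each layer selects a sub-policy in the layer below, until layer 0 returns a primitive action. A self-contained toy sketch of that control flow (ToyLayer and hierarchical_policy are illustrative stand-ins, not part of the project):

class ToyLayer:
    def __init__(self, n_sub):
        self.n_sub = n_sub                       # number of options in the layer below

    def policy(self, state, rs):
        return hash((state, rs)) % self.n_sub    # stand-in for a learned policy

def hierarchical_policy(hierarchy, state, rs=0, layer=None):
    if layer is None:
        layer = len(hierarchy) - 1               # start from the top layer
    action = hierarchy[layer].policy(state, rs)  # pick a sub-policy (or a primitive action)
    if layer != 0:
        return hierarchical_policy(hierarchy, state, action, layer - 1)
    return action                                # layer 0 yields the primitive action

print(hierarchical_policy([ToyLayer(4), ToyLayer(3), ToyLayer(2)], state=(1, 1)))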
Example #5
    def action_abstraction(self, policy, layer):
        mes.currentMessage("Abstracting action")
        mes.currentMessage("Policy (" + str(layer) + "," + str(policy) +
                           ") not specialized, splitting in subtasks")

        if self.max and (len(self.max) <= layer
                         or self.max[layer] <= len(self.hierarchy[layer].Q)):
            mes.currentMessage("Reached maximum size for layer %i" % layer)
            return

        if layer == len(self.hierarchy) - 1:
            return

        self.hierarchy[layer + 1].copyAction(policy)
        self.hierarchy[layer].copyPolicy(policy)

        self.policy_data[layer][policy]['sd'] *= 1.0 / 4.0
        self.policy_data[layer][policy]['mu'] *= 1.0 / 2.0
        self.policy_data[layer][policy]['N'] *= 1.0 / 2.0

        self.policy_data[layer].append(self.policy_data[layer][policy].copy())

        if layer == len(self.hierarchy) - 2:
            self.bottleneck_data['mu'] = stats.reshape_mean(
                self.bottleneck_data['mu'])
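
A toy illustration of the statistics bookkeeping performed above when a policy is split: the parent policy's spread, mean, and sample count are scaled down, and the new sub-policy starts from a copy of those stats. All values here are made up.

policy_data = [{'mu': 4.0, 'sd': 2.0, 'N': 10.0}]
policy = 0

policy_data[policy]['sd'] *= 1.0 / 4.0           # shrink the parent's spread
policy_data[policy]['mu'] *= 1.0 / 2.0           # halve its mean...
policy_data[policy]['N'] *= 1.0 / 2.0            # ...and its sample count
policy_data.append(policy_data[policy].copy())   # new sub-policy inherits the same stats

print(policy_data)   # [{'mu': 2.0, 'sd': 0.5, 'N': 5.0}, {'mu': 2.0, 'sd': 0.5, 'N': 5.0}]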
Example #6
File: LSTM.py  Project: daddabarba/NHRL
    def train_neural_network(self, train_x, train_y, state=None):

        if not state:
            if self.batch:
                state = self.train_cell
            else:
                state = self.cell

        with aux.tempAssign(self.sess, self.cell, state):

            fd = {self.xPH: np.array([train_x]), self.yPH: [train_y]}

            prediction, _, c = (self.sess).run(
                [self.prediction, self.optimizer, self.cost], feed_dict=fd)
            mes.currentMessage("Epoch loss: " + str(c))

            self.epoch += 1

            self.set_train_state(self.getFullState(train_x))
        self.set_state(self.train_cell)

        return prediction
Example #7
    def __init__(self,
                 startingState="c",
                 environment="../../files/maze.txt",
                 pars=None,
                 graphic=True,
                 suppressPrint=False):

        mes.suppress = suppressPrint

        mes.currentMessage("sensors")
        (self.sensors, self.sensorsNames) = attachSensors()

        mes.currentMessage("environment")
        self.environment = env.environment(environment, self, startingState,
                                           graphic)

        mes.settingMessage("live parameters")
        self.livePar = par.agentPar(source=pars)
        mes.setMessage("live parameters")

        self._setHistory()

        self.problemStateDefinition = [
            "leftWall", "rightWall", "topWall", "bottomWall", "previousAction"
        ]
        self.goalStateDefinition = ["exitDetector"]

        mes.currentMessage("initializing starting internal state")
        self.currentState = self.perceive(
            self.problemStateDefinition)  #PARAMETRIZE
        currentGState = self.perceive(self.goalStateDefinition)
        self.rsSize = 1 if not isinstance(currentGState, list) else len(
            currentGState)  #PARAMETRIZE

        mes.settingMessage("Action-state values table")
        #self.qAgent = qLA.hTDWeightBoltzmann(self, len(self.currentState), self.livePar.batchSize, nActions=self.environment.world.numActions, structure=[self.rsSize])
        #self.qAgent = qLA.hTDBoltzmann(self, len(self.currentState), self.livePar.batchSize, nActions=self.environment.world.numActions, structure=[self.rsSize])
        self.qAgent = qLA.tdBoltzmann(self, self.rsSize,
                                      len(self.currentState),
                                      self.environment.world.numActions,
                                      self.livePar.batchSize)
        mes.setMessage("Action-state values table")

        self.graphic = graphic

        if (self.graphic):
            mes.currentMessage("initializing render")
            self.environment._initGraph(self.goalStateDefinition)
Example #8
    def _setHistory(self):
        mes.currentMessage("initializing perceived time")
        self.time = 0

        mes.currentMessage("initializing reward history")
        self.rewardHistory = None

        mes.currentMessage("initializing action history")
        self.actionHistory = None
Example #9
    def learn(self, newState, r):

        mes.currentMessage(
            "Broadcasting reward to previous policy firing chain")

        if type(r) == list:
            r = r[0]

        for layer in self.hierarchy:

            mes.currentMessage("Current layer: %d" %
                               self.hierarchy.index(layer))

            if layer.last_policy or layer.last_policy == 0:
                mes.currentMessage("Reward sent")

                reward = (layer.last_policy, r)
                layer.learn(newState, reward)
Example #10
    def policy(self, state, rs, learning=False):
        p = self._schedule(self.agent.time)
        mes.currentMessage("Schedule: " + str(p))

        dice = float(rand.randint(0, 100)) / 100

        if (dice <= p and not learning):
            mes.currentMessage("acting randomly, with p: " + str(dice))
            a = rand.randint(0, 3)
        else:
            mes.currentMessage("acting rationally, with p: " + str(dice))
            a = super(simAnneal, self).policy(state, rs)

        if not learning:
            self.previous_state = state
            self.last_action = a

            self.last_policy = rs

        return a
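
The method above implements annealed exploration: a schedule maps the agent's time to a probability p, and with probability p a random action is chosen instead of the greedy one. A self-contained toy version of that decision rule follows; the exponential schedule and the four-action space are assumptions, since _schedule is not shown in the listing.

import math
import random

def toy_schedule(t, start=1.0, decay=0.01):
    return start * math.exp(-decay * t)          # assumed decay, not the project's _schedule

def toy_policy(t, greedy_action, n_actions=4):
    p = toy_schedule(t)
    if random.random() <= p:
        return random.randint(0, n_actions - 1)  # explore
    return greedy_action                         # exploit

print([toy_policy(t, greedy_action=2) for t in (0, 100, 1000)])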
Example #11
    def learn(self, newState, r):
        mes.currentMessage("retrieving parameters")

        s1 = self.previous_state
        a = self.last_action
        s2 = newState

        if type(r) == tuple:
            vec_r = [None for i in range(len(self.Q))]
            vec_r[r[0]] = r[1]
            r = vec_r
        elif type(r) != list:
            r = [r] * (len(self.Q))

        mes.currentMessage("learning from transition <" + str(s1) + " , " +
                           str(a) + " , " + str(s2) + " , " + str(r) + ">")

        _alpha = self.agent.livePar.learningRate
        _gamma = self.agent.livePar.discountFactor

        for i in range(len(self.Q)):
            if r[i]:
                valueNext = self.stateValue(s2,
                                            i)  #Q[i][s2][self.policy(s2, i)]

                mes.currentMessage("computing new state action value")
                memory = (_alpha) * (self.stateActionValue(s1, a, i)
                                     )  #((self.Q)[i][s1][a])
                learning = (1 - _alpha) * self.updateValue(
                    r[i], _gamma, valueNext)

                mes.settingMessage("new state action value")
                #(self.Q)[i][s1][a] = memory + learning
                self.updateQ(s1, a, memory + learning, i)

                mes.setMessage("new state action value")
Example #12
File: LSTM.py  Project: daddabarba/NHRL
    def __init__(self,
                 input_size,
                 rnn_size,
                 output_size,
                 alpha=-1,
                 session=None,
                 scope="lstm",
                 cell=None,
                 train_cell=None,
                 batch=True):
        # storing scope name
        self.scope = aux.uniqueScope(scope)

        self.alpha = alpha
        self.batch = batch

        # setting hyperparameters
        self.input_size = input_size
        self.output_size = output_size if type(output_size) != type(
            {}) else len(output_size[out_bias_key])
        self.rnn_size = rnn_size if type(rnn_size) != type({}) else int(
            len(rnn_size[rnn_bias_key]) / 4)

        # counting epochs
        self.epoch = 1

        # i/o placeholders
        self.xPH = tf.placeholder('float', shape=(None, self.input_size))
        self.yPH = tf.placeholder('float', shape=(1, self.output_size))

        with tf.variable_scope(self.scope):

            # saving lstm prediction and state function (w.r.t. input placeholder)
            self.prediction = self.neural_network_model()
            # setting cost function (in function of prediction and output placeholder for target values)
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=self.prediction, labels=self.yPH))
            # setting optimizer
            if alpha and alpha > 0:
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=alpha).minimize(self.cost)
            else:
                self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)

        # starting session
        if not session:
            session = tf.Session()

        self.sess = session
        (self.sess).run(tf.global_variables_initializer())

        if type(output_size) == type({}):
            self.override(self.output_layer[out_weights_key],
                          output_size[out_weights_key])
            self.override(self.output_layer[out_bias_key],
                          output_size[out_bias_key])

        if type(rnn_size) == type({}):
            mes.currentMessage("Overriding RNN weights")

            rnn_size = dict([(self.scope + '/' + name, val)
                             for (name, val) in rnn_size.items()])

            for v in tf.global_variables():
                if (v.name) in list(rnn_size.keys()):
                    self.override(v, rnn_size[v.name])

        if cell:
            self.sess.run(self.cell.c.assign(cell.c))
            self.sess.run(self.cell.h.assign(cell.h))

        if train_cell:
            self.sess.run(self.train_cell.c.assign(train_cell.c))
            self.sess.run(self.train_cell.h.assign(train_cell.h))
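
A minimal construction-and-training sketch for the network above, using the TensorFlow 1.x API the listing relies on. The class name LSTM is an assumption (the listing only identifies the file as LSTM.py), and the sizes, learning rate, and data are illustrative.

import numpy as np
from LSTM import LSTM                # assumed class name; only the file name LSTM.py is shown

net = LSTM(input_size=3, rnn_size=8, output_size=2, alpha=0.01)
x = np.array([0.0, 1.0, 0.0])        # one input vector, length input_size
y = np.array([1.0, 0.0])             # one-hot target, length output_size
for _ in range(10):
    prediction = net.train_neural_network(x, y)   # one optimizer step per call (see Example #6)
print(prediction)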
Example #13
    def task_abstraction(self, rs=0):

        if self.max and len(self.max) <= len(self.hierarchy):
            mes.currentMessage(
                "Reached maximum size for bottom-up abstraction")
            return

        mes.warningMessage("Getting network's parameters")

        ANN = self.hierarchy[-1].Q[rs]
        parameters = ANN.getCopy()

        mes.warningMessage("unrolling parameters")

        W = parameters['out'][lstm.out_weights_key]
        b = parameters['out'][lstm.out_bias_key]
        rnn = parameters['rnn']

        mes.warningMessage("Getting network shape")

        size = (np.shape(W)[1])

        mes.warningMessage("Computing new weights")

        new_w = (W * (1 / size)).sum(1)
        new_W = np.transpose(np.array([new_w, new_w]))

        mes.warningMessage("Computing new biases")

        _b = (b * size).sum()
        new_b = np.array([_b, _b])

        mes.warningMessage("Rolling weights and biases")

        W_pars = {lstm.out_weights_key: new_W, lstm.out_bias_key: new_b}

        mes.warningMessage("Copying policy")

        (self.hierarchy)[-1].copyPolicy(rs)
        #new_ANN = ANN.restart(ANN.input_size, rnn, W_pars, ANN.alpha, self.sess, ANN.scope)

        mes.warningMessage("Restaring top policy")

        self.hierarchy.append(
            self.policyClass(self.agent, 1, self.stateSize, 2, self.batch_size,
                             None))
        self.hierarchy[-1].Q[0] = self.hierarchy[-1].Q[0].restart(
            ANN.input_size, rnn, W_pars, ANN.alpha, None, ANN.scope)

        mes.currentMessage("Adjusting stats")

        self.bottleneck_data = self.make_bottleneck_data(2)

        self.policy_data.append([self.policy_data[-1][rs].copy()])
        self.layer_data.append(self.layer_data[-1].copy())

        self.policy_data[-2][rs]['sd'] *= 1.0 / 4.0
        self.policy_data[-2][rs]['mu'] *= 1.0 / 2.0

        self.policy_data[-2].append(self.policy_data[-2][rs].copy())

        self.bottleneck_data['mu'] = np.array([0.5, 0.5])
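
The weight manipulation above averages the output weights across the existing output columns and duplicates the result into a two-column matrix (with a matching two-element bias) for the restarted two-action policy. A small NumPy sketch of just that shape arithmetic, with toy values:

import numpy as np

W = np.array([[1.0, 3.0],            # toy output weights, shape (rnn_size, size)
              [2.0, 4.0]])
b = np.array([0.5, 0.5])             # toy output biases, shape (size,)
size = np.shape(W)[1]

new_w = (W * (1 / size)).sum(1)                  # per-unit column average -> shape (rnn_size,)
new_W = np.transpose(np.array([new_w, new_w]))   # duplicated into shape (rnn_size, 2)

_b = (b * size).sum()                            # scaled bias sum, as in the original
new_b = np.array([_b, _b])

print(new_W.shape, new_b)            # (2, 2) [2. 2.]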