def act(self, rs):
    mes.currentMessage("selecting action according to current believed state")
    action = self.qAgent.policy(self.currentState, rs)
    self.actionHistory = action

    mes.currentMessage("performing action: " + str(action))
    (self.environment).performAction(action)  # here the actual state is updated
    self.updatePerceivedTime()

    mes.currentMessage("perceiving")
    newState = self.perceive(self.problemStateDefinition)  # PARAMETRIZE
    mes.message("current problem state: " + str(newState))

    newGState = self.perceive(self.goalStateDefinition)
    reward = self.R(newGState)
    self.rewardHistory = reward
    mes.currentMessage("Reward: " + str(reward))

    mes.currentMessage("learning from previous transition")
    self.qAgent.learn(newState, reward)
    self.currentState = newState
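# Hedged usage sketch: assuming act() belongs to the top-level agent class and
# rs=0 addresses its single goal-derived reward signal, one interaction step is
# simply `theAgent.act(0)`: pick an action from the current believed state,
# execute it in the environment, perceive the new state, then learn from the
# observed transition.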
def __init__(self, fileName, agent, startingState, graph):
    mes.currentMessage("linking agent")
    self.agent = agent

    mes.currentMessage("converting SVG file into lines format")
    (self.lines, self.size) = con._convertFile(fileName)

    mes.currentMessage("converting lines format into map format")
    self.maps = con._convertLines(self.lines, self.size)

    mes.currentMessage("retrieving interest points")
    (self.features, self.ftNames) = _getFeaturesMatrix(self.maps, self.size)

    mes.settingMessage("world")
    if startingState in ("c", "center", "centre"):
        startingState = int(((self.size)[0] * (self.size)[1]) / 2 + (self.size)[0] / 2) - 1

    #self.world = pdm.stochasticMaze(self.size, self.lines, _preDefRewardSet(self.features), startingState)
    self.world = pdm.stochasticMaze(self.size, self.lines, [], startingState)
    mes.setMessage("world")

    self.graph = graph
def policy(self, state, rs, learning=False):
    (a, stateValue) = self.argMaxQ(state, rs)
    mes.currentMessage("evaluating state at: " + str(stateValue) + ", with best action: " + str(a))
    mes.currentMessage("acting rationally")

    if not learning:
        self.previous_state = state
        self.last_action = a
        self.last_policy = rs

    return a
def policy(self, state, rs=0, layer=None):
    if layer is None:
        layer = len(self.hierarchy) - 1

    action = self.hierarchy[layer].policy(state, rs)
    self.hierarchy[layer].rec(rs)

    if layer == len(self.hierarchy) - 1:
        self.updateBNData(state, rs)

    abstract_state = self.state_abstraction(state, layer, rs)
    self.updateData(abstract_state, rs, layer)

    if layer != 0:
        mes.currentMessage("action: " + str(action) + "/" + str(len(self.hierarchy[layer - 1].Q) - 1))
        return self.policy(state, action, layer - 1)

    return action
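# Hedged trace, assuming a three-layer hierarchy: each layer's "action" selects
# which sub-policy of the layer below to query, until layer 0 emits a
# primitive action:
#
#   policy(s)           -> layer 2 picks rs_1
#   policy(s, rs_1, 1)  -> layer 1 picks rs_0
#   policy(s, rs_0, 0)  -> layer 0 returns the primitive action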
def action_abstraction(self, policy, layer):
    mes.currentMessage("Abstracting action")
    mes.currentMessage("Policy (" + str(layer) + "," + str(policy) + ") not specialized, splitting into subtasks")

    if self.max and (len(self.max) <= layer or self.max[layer] <= len(self.hierarchy[layer].Q)):
        mes.currentMessage("Reached maximum size for layer %i" % layer)
        return

    # the top layer has no layer above it to receive the new abstract action
    if layer == len(self.hierarchy) - 1:
        return

    self.hierarchy[layer + 1].copyAction(policy)
    self.hierarchy[layer].copyPolicy(policy)

    self.policy_data[layer][policy]['sd'] *= 1.0 / 4.0
    self.policy_data[layer][policy]['mu'] *= 1.0 / 2.0
    self.policy_data[layer][policy]['N'] *= 1.0 / 2.0
    self.policy_data[layer].append(self.policy_data[layer][policy].copy())

    if layer == len(self.hierarchy) - 2:
        self.bottleneck_data['mu'] = stats.reshape_mean(self.bottleneck_data['mu'])
def train_neural_network(self, train_x, train_y, state=None):
    if not state:
        state = self.train_cell if self.batch else self.cell

    with aux.tempAssign(self.sess, self.cell, state):
        fd = {self.xPH: np.array([train_x]), self.yPH: [train_y]}
        prediction, _, c = (self.sess).run(
            [self.prediction, self.optimizer, self.cost], feed_dict=fd)

        mes.currentMessage("Epoch loss: " + str(c))
        self.epoch += 1

        self.set_train_state(self.getFullState(train_x))
        self.set_state(self.train_cell)

    return prediction
def __init__(self, startingState="c", environment="../../files/maze.txt",
             pars=None, graphic=True, suppressPrint=False):
    mes.suppress = suppressPrint

    mes.currentMessage("sensors")
    (self.sensors, self.sensorsNames) = attachSensors()

    mes.currentMessage("environment")
    self.environment = env.environment(environment, self, startingState, graphic)

    mes.settingMessage("live parameters")
    self.livePar = par.agentPar(source=pars)
    mes.setMessage("live parameters")

    self._setHistory()

    self.problemStateDefinition = [
        "leftWall", "rightWall", "topWall", "bottomWall", "previousAction"
    ]
    self.goalStateDefinition = ["exitDetector"]

    mes.currentMessage("initializing starting internal state")
    self.currentState = self.perceive(self.problemStateDefinition)  # PARAMETRIZE
    currentGState = self.perceive(self.goalStateDefinition)
    self.rsSize = 1 if not isinstance(currentGState, list) else len(currentGState)  # PARAMETRIZE

    mes.settingMessage("Action-state values table")
    #self.qAgent = qLA.hTDWeightBoltzmann(self, len(self.currentState), self.livePar.batchSize, nActions=self.environment.world.numActions, structure=[self.rsSize])
    #self.qAgent = qLA.hTDBoltzmann(self, len(self.currentState), self.livePar.batchSize, nActions=self.environment.world.numActions, structure=[self.rsSize])
    self.qAgent = qLA.tdBoltzmann(self, self.rsSize, len(self.currentState),
                                  self.environment.world.numActions,
                                  self.livePar.batchSize)
    mes.setMessage("Action-state values table")

    self.graphic = graphic
    if self.graphic:
        mes.currentMessage("initializing render")
        self.environment._initGraph(self.goalStateDefinition)
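# Hedged usage sketch: assuming this __init__ belongs to the top-level agent
# class (here called `agent` for illustration), a headless training run could
# look like:
#
#   a = agent(startingState="c", graphic=False, suppressPrint=True)
#   for _ in range(10000):  # hypothetical number of steps
#       a.act(0)            # rs=0: the single goal-derived reward signal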
def _setHistory(self):
    mes.currentMessage("initializing perceived time")
    self.time = 0

    mes.currentMessage("initializing reward history")
    self.rewardHistory = None

    mes.currentMessage("initializing action history")
    self.actionHistory = None
def learn(self, newState, r):
    mes.currentMessage("Broadcasting reward to previous policy firing chain")

    if isinstance(r, list):
        r = r[0]

    for layer in self.hierarchy:
        mes.currentMessage("Current layer: %d" % self.hierarchy.index(layer))
        if layer.last_policy is not None:
            mes.currentMessage("Reward sent")
            reward = (layer.last_policy, r)
            layer.learn(newState, reward)
def policy(self, state, rs, learning=False):
    p = self._schedule(self.agent.time)
    mes.currentMessage("Schedule: " + str(p))

    dice = float(rand.randint(0, 100)) / 100

    if dice <= p and not learning:
        mes.currentMessage("acting randomly, with p: " + str(dice))
        a = rand.randint(0, 3)
    else:
        mes.currentMessage("acting rationally, with p: " + str(dice))
        a = super(simAnneal, self).policy(state, rs)

    if not learning:
        self.previous_state = state
        self.last_action = a
        self.last_policy = rs

    return a
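# Note: this is an annealed epsilon-greedy rule: with probability p, which
# _schedule derives from self.agent.time, one of the four primitive moves is
# drawn uniformly at random; otherwise the parent (rational) policy is used.
# Hedged worked example, assuming a linear schedule p(t) = max(0, 1 - t/T):
#   t = 0     -> p = 1.0, always explore
#   t = T/2   -> p = 0.5, explore half the time
#   t >= T    -> p = 0.0, always act greedily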
def learn(self, newState, r): mes.currentMessage("retrieving parameters") s1 = self.previous_state a = self.last_action s2 = newState if type(r) == tuple: vec_r = [None for i in range(len(self.Q))] vec_r[r[0]] = r[1] r = vec_r elif type(r) != list: r = [r] * (len(self.Q)) mes.currentMessage("learning from transition <" + str(s1) + " , " + str(a) + " , " + str(s2) + " , " + str(r) + ">") _alpha = self.agent.livePar.learningRate _gamma = self.agent.livePar.discountFactor for i in range(len(self.Q)): if r[i]: valueNext = self.stateValue(s2, i) #Q[i][s2][self.policy(s2, i)] mes.currentMessage("computing new state action value") memory = (_alpha) * (self.stateActionValue(s1, a, i) ) #((self.Q)[i][s1][a]) learning = (1 - _alpha) * self.updateValue( r[i], _gamma, valueNext) mes.settingMessage("new state action value") #(self.Q)[i][s1][a] = memory + learning self.updateQ(s1, a, memory + learning, i) mes.setMessage("new state action value")
def __init__(self, input_size, rnn_size, output_size, alpha=-1, session=None,
             scope="lstm", cell=None, train_cell=None, batch=True):
    # storing scope name
    self.scope = aux.uniqueScope(scope)
    self.alpha = alpha
    self.batch = batch

    # setting hyperparameters; a dict in place of a size means "restore these weights"
    self.input_size = input_size
    self.output_size = output_size if not isinstance(output_size, dict) else len(output_size[out_bias_key])
    self.rnn_size = rnn_size if not isinstance(rnn_size, dict) else int(len(rnn_size[rnn_bias_key]) / 4)

    # counting epochs
    self.epoch = 1

    # i/o placeholders
    self.xPH = tf.placeholder('float', shape=(None, self.input_size))
    self.yPH = tf.placeholder('float', shape=(1, self.output_size))

    with tf.variable_scope(self.scope):
        # saving lstm prediction and state function (w.r.t. input placeholder)
        self.prediction = self.neural_network_model()

        # setting cost function (as a function of the prediction and of the output placeholder for target values)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.prediction, labels=self.yPH))

        # setting optimizer
        if alpha and alpha > 0:
            self.optimizer = tf.train.AdamOptimizer(learning_rate=alpha).minimize(self.cost)
        else:
            self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)

    # starting session
    if not session:
        session = tf.Session()
    self.sess = session
    (self.sess).run(tf.global_variables_initializer())

    # optionally overriding output-layer weights passed in as a dictionary
    if isinstance(output_size, dict):
        self.override(self.output_layer[out_weights_key], output_size[out_weights_key])
        self.override(self.output_layer[out_bias_key], output_size[out_bias_key])

    # optionally overriding RNN weights passed in as a dictionary
    if isinstance(rnn_size, dict):
        mes.currentMessage("Overriding RNN weights")
        rnn_size = {self.scope + '/' + name: val for (name, val) in rnn_size.items()}
        for v in tf.global_variables():
            if v.name in rnn_size:
                self.override(v, rnn_size[v.name])

    # optionally restoring LSTM cell states
    if cell:
        self.sess.run(self.cell.c.assign(cell.c))
        self.sess.run(self.cell.h.assign(cell.h))
    if train_cell:
        self.sess.run(self.train_cell.c.assign(train_cell.c))
        self.sess.run(self.train_cell.h.assign(train_cell.h))
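# Hedged usage sketch, assuming this class is named `lstm` and that
# train_neural_network() (above) performs one optimization step per call:
#
#   net = lstm(input_size=5, rnn_size=16, output_size=4)  # fresh network
#   for x, y in data:   # hypothetical stream of (input, one-hot target) pairs
#       net.train_neural_network(x, y)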
def task_abstraction(self, rs=0):
    if self.max and len(self.max) <= len(self.hierarchy):
        mes.currentMessage("Reached maximum size for bottom-up abstraction")
        return

    mes.warningMessage("Getting network's parameters")
    ANN = self.hierarchy[-1].Q[rs]
    parameters = ANN.getCopy()

    mes.warningMessage("unrolling parameters")
    W = parameters['out'][lstm.out_weights_key]
    b = parameters['out'][lstm.out_bias_key]
    rnn = parameters['rnn']

    mes.warningMessage("Getting network shape")
    size = np.shape(W)[1]

    mes.warningMessage("Computing new weights")
    new_w = (W * (1 / size)).sum(1)
    new_W = np.transpose(np.array([new_w, new_w]))

    mes.warningMessage("Computing new biases")
    _b = (b * size).sum()
    new_b = np.array([_b, _b])

    mes.warningMessage("Rolling weights and biases")
    W_pars = {lstm.out_weights_key: new_W, lstm.out_bias_key: new_b}

    mes.warningMessage("Copying policy")
    (self.hierarchy)[-1].copyPolicy(rs)

    #new_ANN = ANN.restart(ANN.input_size, rnn, W_pars, ANN.alpha, self.sess, ANN.scope)
    mes.warningMessage("Restarting top policy")
    self.hierarchy.append(
        self.policyClass(self.agent, 1, self.stateSize, 2, self.batch_size, None))
    self.hierarchy[-1].Q[0] = self.hierarchy[-1].Q[0].restart(
        ANN.input_size, rnn, W_pars, ANN.alpha, None, ANN.scope)

    mes.currentMessage("Adjusting stats")
    self.bottleneck_data = self.make_bottleneck_data(2)
    self.policy_data.append([self.policy_data[-1][rs].copy()])
    self.layer_data.append(self.layer_data[-1].copy())

    self.policy_data[-2][rs]['sd'] *= 1.0 / 4.0
    self.policy_data[-2][rs]['mu'] *= 1.0 / 2.0
    self.policy_data[-2].append(self.policy_data[-2][rs].copy())

    self.bottleneck_data['mu'] = np.array([0.5, 0.5])