def train(self):
    if len(self.pybdataset) == 0:
        return
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.network, dataset=self.pybdataset,
                                batchlearning=True, verbose=False)
    # trainer = BackpropTrainer(self.network, dataset=self.pybdataset,
    #                           batchlearning=True, verbose=True)
    trainer.trainEpochs(100)
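A minimal sketch of the context this method assumes: a PyBrain network paired with a SupervisedDataSet held in pybdataset. The host class name and the layer sizes here are illustrative assumptions, not taken from the snippet.

from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.shortcuts import buildNetwork

class TrainableModule(object):  # hypothetical host class
    def __init__(self, indim=2, hidden=5, outdim=1):
        self.network = buildNetwork(indim, hidden, outdim)
        self.pybdataset = SupervisedDataSet(indim, outdim)

    def addSample(self, inp, tgt):
        # accumulate training pairs for the train() method above
        self.pybdataset.addSample(inp, tgt)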
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)
    # alternative: backprop, was not as stable as rprop
    # trainer = BackpropTrainer(self.module.network, dataset=supervised,
    #                           learningrate=0.01, batchlearning=True, verbose=True)
    trainer.trainEpochs(1)
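A hedged usage sketch for wiring an NFQ-style learner like the one above into PyBrain's RL stack. ActionValueNetwork supplies the .network, .getValue, and .getActionValues members these snippets rely on; the state dimension and action count are assumptions.

from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.agents import LearningAgent

module = ActionValueNetwork(dimState=2, numActions=3)  # assumed sizes
learner = NFQ()
learner.gamma = 0.9  # discount factor used in the target above
agent = LearningAgent(module, learner)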
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, int(action_[0]))
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            # soft Q update with a fixed 0.5 step size
            tgt = Q + 0.5 * (reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, int(action_[0]))
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            # inp = r_[state_, action_]  # alternative raw-action encoding
            # soft Q update with learning rate alpha
            tgt = Q + self.alpha * (reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
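A hedged illustration of the input encoding shared by these learn() variants: the state vector is concatenated with a one-hot encoding of the discrete action index. The concrete numbers are made up.

from numpy import r_
from pybrain.utilities import one_to_n

state_ = [0.2, -0.5]
action_index = 1
numActions = 3
inp = r_[state_, one_to_n(action_index, numActions)]
# inp -> array([ 0.2, -0.5,  0. ,  1. ,  0. ])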
def makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize):
    from pybrain.datasets import SupervisedDataSet
    supervised = SupervisedDataSet(polNet.indim, numAct)  # numInputs, numOutputs
    # Try all the actions and see which has the best value
    for state in policyEvalStates:
        vBest = -100000
        for action in range(numAct):
            nextState = [ep.updateDist(state, stepSize, numAct, action)]
            vNext = valNet.activate(nextState)
            if vNext > vBest:
                actBest = action
                vBest = vNext
        from pybrain.utilities import one_to_n
        supervised.addSample(state, one_to_n(actBest, numAct))
    # Print supervised training set
    # print(supervised)
    # input()
    # Train neural network
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer
    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)
    # Some interpolation is acceptable here; it's the values that need to be exact.
    trainer.trainUntilConvergence(maxEpochs=50)
    return polNet
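A hedged setup sketch for makeGreedy1D: value and policy networks over a 1-D distance state. The layer sizes and state grid are assumptions; ep.updateDist comes from the author's environment module and is not reproduced here.

from pybrain.tools.shortcuts import buildNetwork

numAct = 3
stepSize = 0.5
valNet = buildNetwork(1, 20, 1, bias=True)       # state -> value
polNet = buildNetwork(1, 20, numAct, bias=True)  # state -> action preferences
policyEvalStates = [0.1 * i for i in range(11)]  # discretized 1-D states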
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset[self.indexOfAgent]:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, int(action_[0]))
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            if self.isFirstLerning:
                tgt = reward_  # bootstrap from the raw reward on the first pass
            else:
                tgt = Q + 0.5 * (reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)  # for reward normalization
            # update last experience with current one
            lastexperience = (state, action, reward)
    # Re-building networks is required in multiprocessing environments.
    params = self.module.network.params
    self.module.network = buildNetwork(self.module.indim + self.module.numActions,
                                       self.module.indim + self.module.numActions, 1)
    self.module.network._setParameters(params)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)  # weightdecay=0.01 is another option
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
    if self.isFirstLerning:
        self.isFirstLerning = False
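A hedged illustration of the rebuild-and-restore idiom above: PyBrain networks expose their weights as a flat params array, so a freshly built network of identical topology can adopt them via _setParameters. The sizes are assumptions.

from pybrain.tools.shortcuts import buildNetwork

net = buildNetwork(5, 5, 1)
params = net.params.copy()
rebuilt = buildNetwork(5, 5, 1)  # same topology as the original
rebuilt._setParameters(params)   # weights carried over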
def train(self, transitionSamples):
    print("Training...")
    k = 0
    trainer = RPropMinusTrainer(self.Q, batchlearning=True)
    # trainer = BackpropTrainer(self.Q, batchlearning=False)
    TS = SupervisedDataSet(4, 1)
    while k < self._epochs:
        if k % 10 == 0:
            print("\t", k)
        # Build the training set from the samples
        # Input: 4-dimensional vector (angle, angular velocity, position, action)
        # Target: value
        TS.clear()
        for s, a, s_1, costo in transitionSamples:
            # Take Q for s', for every possible action
            # (vector with the value of s' for each of the 3 possible actions)
            # Q_s1 = [self.Q.activate([s_1.angulo, s_1.velocidadAngular, s_1.posicion, b])
            #         for b in range(Accion.maxValor + 1)]
            valDerecha = self.Q.activate([s_1.angulo, s_1.velocidadAngular,
                                          s_1.posicion, Accion.DERECHA])
            valIzquierda = self.Q.activate([s_1.angulo, s_1.velocidadAngular,
                                            s_1.posicion, Accion.IZQUIERDA])
            if valDerecha >= 1 or valDerecha <= 0:
                print("Invalid Q value:", valDerecha)
            if valIzquierda >= 1 or valIzquierda <= 0:
                print("Invalid Q value:", valIzquierda)
            # Input and target for the neural network
            inputVal = (s.angulo, s.velocidadAngular, s.posicion, a)
            if costo == 0:
                targetVal = costo
            else:
                targetVal = costo + self._gamma * min(valDerecha, valIzquierda)
            if targetVal > 1 or targetVal < 0:
                print("Invalid target:", targetVal)
            TS.addSample(inputVal, targetVal)
        # Train the neural network
        trainer.setData(TS)
        trainer.train()  # 1 epoch
        # trainer.trainEpochs(self._epochsNN)
        k = k + 1
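A hedged sketch of the helper types the trainer above assumes: an action enumeration and a state record. The field names follow the snippet; the class Estado and the concrete values are assumptions.

class Accion(object):
    IZQUIERDA = 0  # push cart left
    DERECHA = 1    # push cart right
    maxValor = 1   # highest action index

class Estado(object):  # hypothetical state record
    def __init__(self, angulo, velocidadAngular, posicion):
        self.angulo = angulo
        self.velocidadAngular = velocidadAngular
        self.posicion = posicion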
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

# nn = buildRecurrentNetwork()
nn = loadRecurrentNetwork('recurrentNetwork.xml')
dataset = buildDataset()
trainer = RPropMinusTrainer(nn)
trainer.setData(dataset)
print('dataset set for trainer')
trainer.trainUntilConvergence()
print('trained to convergence')
NetworkWriter.writeToFile(nn, 'recurrentNetwork.xml')
def test_multilayer_perceptron():
    def plot(fig, data):
        ax = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data],
                   marker=marker, color=color, s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split
        for x_value in [float(i) * xspan + x_range[0] for i in range(split)]:
            for y_value in [float(j) * yspan + y_range[0] for j in range(split)]:
                # if nn.predict([x_value, y_value])[0] >= 0.5:
                if nn.activate([x_value, y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt

    # Get the training data
    x_range = [0, 1]
    y_range = [0, 1]
    # liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    # liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0], color='g')
    scat(fig, [key for key, value in liner_data.items() if value == 1], color='b')

    # Build the network
    network = build_network()
    # mlnn = MultiLayerNeuralNetwork([2, 5, 1],
    #                                threshold=0.1,
    #                                start_learning_coef=0.2,
    #                                sigmoid_alpha=10,
    #                                mini_batch=100,
    #                                layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                rprop=True)

    # Train
    # error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network, dataset=supervised,
                                batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)

    # Compute y for each x and draw the learned separation line
    data = get_predict_list(x_range, y_range, network, split=20)
    plot(fig, data)

    # # Show the error history
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    plt.show()
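Hedged sketches of the undefined helpers build_network and get_supervised referenced above. The names match the snippet; the bodies are assumptions about what they plausibly do.

from pybrain.datasets import SupervisedDataSet
from pybrain.structure import SigmoidLayer
from pybrain.tools.shortcuts import buildNetwork

def build_network():
    # 2 inputs, a small hidden layer, 1 sigmoid output in [0, 1]
    return buildNetwork(2, 5, 1, hiddenclass=SigmoidLayer, outclass=SigmoidLayer)

def get_supervised(network, inputs, outputs):
    # pack parallel input/target lists into a PyBrain dataset
    ds = SupervisedDataSet(network.indim, network.outdim)
    for inp, tgt in zip(inputs, outputs):
        ds.addSample(inp, tgt)
    return ds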
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = []
    dats = []  # [seq index][turn] = [state, jointAct, jointReward]
    for i in range(self.num_agents):
        supervised.append(SupervisedDataSet(self.num_features + self.actionDiminInput, 1))
    for i in range(self.dataset[self.indexOfAgent].getNumSequences()):
        seq = []
        for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])):
            state = self.dataset[self.indexOfAgent].getSequence(i)[0][j]
            jointAct = []
            jointReward = []
            for k in range(self.num_agents):
                jointAct.append(self.dataset[k].getSequence(i)[1][j][0])
                jointReward.append(self.dataset[k].getSequence(i)[2][j][0])
            seq.append([state, jointAct, jointReward])
        dats.append(seq)
    # prepare one data set per agent
    for i in range(self.num_agents):
        for seq in dats:
            lastexperience = None
            for sarPair in seq:
                state = sarPair[0]
                action = sarPair[1]
                reward = sarPair[2]
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue
                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                # update Q-value function approximator
                qValuesNext = self._qValuesForAllPossibleJointAction(state)
                eqNext = findCorrelatedEquilibrium(self.num_agents, self.num_actions,
                                                   qValuesNext, self.possibleJointAction,
                                                   self.w4ActIndexing)
                # learn
                inp = self._EncodeStateAndJointActionIntoInputVector(state_, action_)
                if self.isFirstLerning:
                    target = reward_[i]
                else:
                    target = reward_[i] + self.rewardDiscount * max(
                        self._qValuesForEachActionOfAgent(state, eqNext, i))
                target = np.array([target])
                supervised[i].addSample(inp, target)
                # update last experience with current one
                lastexperience = (state, action, reward)
    if self.isFirstLerning:
        self.isFirstLerning = False
    procTrainers = []
    qResult = Queue()
    for i in range(self.num_agents):
        trainer = RPropMinusTrainer(self.linQ[i], dataset=supervised[i],
                                    batchlearning=True, verbose=False)
        if not self.validateMultiProc:
            trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
        else:
            procTrainers.append(Process(target=self._learningQfunction,
                                        kwargs={"trainer": trainer, "i": i, "q": qResult}))
    if self.validateMultiProc:
        for proc in procTrainers:
            proc.start()
        for i in range(self.num_agents):
            res = qResult.get()
            self.linQ[res[0]] = res[1]
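A hedged reconstruction of the _learningQfunction worker used above: given how its results are consumed from the queue as (agent index, network), it plausibly trains to convergence in the child process and ships the trained module back. This is an assumption, not the author's original body.

def _learningQfunction(self, trainer, i, q):
    # train this agent's Q network in a child process
    trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
    # return the result as (agent index, trained module)
    q.put((i, trainer.module))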
def evalPolicy1D(valNet, polNet, policyEvalStates, vMaxAll, stepSize, thermRadius):
    vDiffStart = 10000
    vDiff = vDiffStart
    while vDiff > vMaxAll:
        vDiff = vDiffStart
        for state in policyEvalStates:  # go through the states in question
            # Determine the chosen action from the policy network
            actionPref = polNet.activate([state])
            chosenAction = np.argmax(actionPref)  # pick the highest output
            # Determine the next state (from contThermalEnvironment)
            numAng = len(actionPref)
            oldDist = state
            nextState = [ep.updateDist(oldDist, stepSize, numAng, chosenAction)]
            # Calculate the reward for the transition and the new value of the
            # state under the current policy. The discount rate sets how
            # farsighted we are (0 = myopic, 1 = very farsighted).
            discRate = 0.7
            scale = 10  # size of reward
            reward = getReward1D(state, thermRadius, scale)
            # New value estimate
            VstateNew = reward + discRate * valNet.activate(nextState)
            # Track the maximum value change seen so far
            VstateOld = valNet.activate([state])
            vChange = abs(VstateOld - VstateNew)
            if vDiff == vDiffStart:
                vDiff = vChange
            elif vChange > vDiff:
                vDiff = vChange
            # Update the value network with the new estimate, keeping all other
            # states' values fixed. First, build the training examples:
            from pybrain.datasets import SupervisedDataSet
            supervised = SupervisedDataSet(valNet.indim, 1)  # numInputs, numOutputs
            supervised.addSample(state, VstateNew)
            for loc in policyEvalStates:  # pin every other discretized state
                if loc != state:
                    inp = loc
                    tgt = valNet.activate([loc])
                    supervised.addSample(inp, tgt)
            # Next, train on these examples. Train manually to avoid holding out
            # validation data, as trainUntilConvergence would:
            # trainer.trainUntilConvergence(maxEpochs=maxEpochsVal, validationProportion=0)
            # Overfitting here is acceptable as long as generalization stays OK.
            from pybrain.supervised.trainers.rprop import RPropMinusTrainer
            trainer = RPropMinusTrainer(valNet, dataset=supervised, verbose=False)
            numTrainIter = 30
            for i in range(numTrainIter):
                trainer.train()
            # Print training status
            # print('Old dist:', oldDist)
            # print('Preferences:', actionPref)
            # print('Choice:', chosenAction)
            # print('New dist:', nextState)
            # print('Reward:', reward)
            # print('New Value:', VstateNew)
            # print('Value change:', vChange)
            # print('Max change:', vDiff)
            # print('Supervised data set:', supervised)
            # print('Actual network outputs:')
            # for loc in policyEvalStates:
            #     print(valNet.activate([loc]))
            # input()
    # Return the updated value network
    print('Max value change: ', vDiff)
    import sys
    sys.stdout.flush()
    return valNet
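A hedged sketch of how evalPolicy1D and makeGreedy1D above could be combined into a simple policy-iteration loop. The networks and state grid reuse the setup sketched after makeGreedy1D; the tolerance, radius, and iteration count are assumptions, and getReward1D and ep come from the author's own modules.

vMaxAll = 0.1      # convergence tolerance for the value sweep (assumed)
thermRadius = 2.0  # assumed thermal radius
for _ in range(10):
    # evaluate the current policy, then make the policy greedy w.r.t. it
    valNet = evalPolicy1D(valNet, polNet, policyEvalStates,
                          vMaxAll, stepSize, thermRadius)
    polNet = makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize)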