Example No. 1
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)

        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
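
The learner above turns each stored transition into one supervised sample: the previous state concatenated with a one-hot encoding of the action taken, paired with a soft TD target. A standalone, hedged sketch of that conversion is shown below; it is not part of the example, and the state size, action count, gamma and the 0.5 step are arbitrary toy values.

# Sketch (assumption, not the original learner): build one NFQ-style
# supervised sample from a made-up transition and fit it with RPropMinusTrainer.
from numpy import r_
from pybrain.datasets import SupervisedDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.utilities import one_to_n
from pybrain.supervised.trainers.rprop import RPropMinusTrainer

state_dim, num_actions, gamma = 2, 3, 0.9           # toy sizes for illustration
net = buildNetwork(state_dim + num_actions, 5, 1)   # Q(s, a) approximator
supervised = SupervisedDataSet(state_dim + num_actions, 1)

# one made-up transition; in the learner above these come from self.dataset
s, a, r, s_next = [0.1, 0.4], 1, 1.0, [0.2, 0.5]
q_sa = net.activate(r_[s, one_to_n(a, num_actions)])[0]
q_next = max(net.activate(r_[s_next, one_to_n(b, num_actions)])[0]
             for b in range(num_actions))
target = q_sa + 0.5 * (r + gamma * q_next - q_sa)   # same soft TD target as above
supervised.addSample(r_[s, one_to_n(a, num_actions)], target)

trainer = RPropMinusTrainer(net, dataset=supervised, batchlearning=True, verbose=False)
for _ in range(20):     # a few Rprop epochs; trainUntilConvergence would need a
    trainer.train()     # bigger dataset because it holds out a validation split
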
Example No. 2
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)

        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = Q + 0.5 * (reward_ + self.gamma *
                                 max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network,
                                    dataset=supervised,
                                    batchlearning=True,
                                    verbose=False)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
Example No. 3
def makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize):

    from pybrain.datasets import SupervisedDataSet
    from pybrain.utilities import one_to_n
    supervised = SupervisedDataSet(polNet.indim, numAct)  # numInput, numOutputs

    # Try all the actions and see which has the best value
    for state in policyEvalStates:
        vBest = -100000
        for action in range(numAct):
            nextState = [ep.updateDist(state, stepSize, numAct, action)]
            vNext = valNet.activate(nextState)
            if vNext > vBest:
                actBest = action
                vBest = vNext
        supervised.addSample(state, one_to_n(actBest, numAct))
    
    # Print supervised training set 
    # print(supervised)
    # input()
    
    # Train neural network
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer                
    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)  
    trainer.trainUntilConvergence(maxEpochs=50) # I'm OK with some interpolation here. It's the values we need to be exact on.
    return polNet
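
For reference, one_to_n from pybrain.utilities is what turns the chosen action index into the 1-of-n target vector that the policy network is trained on above. A quick illustration, assuming numAct = 4:

from pybrain.utilities import one_to_n

print(one_to_n(2, 4))   # array of length 4 with a 1.0 at index 2, i.e. [0, 0, 1, 0]
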
Example No. 4
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        for seq in self.dataset[self.indexOfAgent]:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                if self.isFirstLerning:
                    tgt = reward_
                else:
                    tgt = Q + 0.5 * (reward_ + self.gamma * max(
                        self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                #for reward normalization

                # update last experience with current one
                lastexperience = (state, action, reward)

        # Re-building networks is required in multiprocessing environments.
        params = self.module.network.params
        self.module.network = buildNetwork(
            self.module.indim + self.module.numActions,
            self.module.indim + self.module.numActions, 1)
        self.module.network._setParameters(params)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network,
                                    dataset=supervised,
                                    batchlearning=True,
                                    verbose=False)  #, weightdecay=0.01)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
        if self.isFirstLerning:
            self.isFirstLerning = False
Example No. 5
 def learn(self):
     # convert reinforcement dataset to NFQ supervised dataset
     supervised = SupervisedDataSet(self.module.network.indim, 1)
     for seq in self.dataset[self.indexOfAgent]:
         lastexperience = None
         for state, action, reward in seq:
             if not lastexperience:
                 # delay each experience in sequence by one
                 lastexperience = (state, action, reward)
                 continue
             
             # use experience from last timestep to do Q update
             (state_, action_, reward_) = lastexperience
             
             Q = self.module.getValue(state_, action_[0])
             
             inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
             if self.isFirstLerning:
                 tgt = reward_
             else:
                 tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
             supervised.addSample(inp, tgt)
             
             #for reward normalization
             
             # update last experience with current one
             lastexperience = (state, action, reward)
             
     # Re-building networks is required in multiprocessing environments.
     params=self.module.network.params
     self.module.network=buildNetwork(self.module.indim+self.module.numActions, 
                                      self.module.indim+self.module.numActions, 
                                      1)
     self.module.network._setParameters(params)
     
     # train module with backprop/rprop on dataset
     trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)#, weightdecay=0.01)
     trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
     if self.isFirstLerning:
         self.isFirstLerning=False
Example No. 6
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

#nn = buildRecurrentNetwork()
nn = loadRecurrentNetwork('recurrentNetwork.xml')
dataset = buildDataset()

trainer = RPropMinusTrainer(nn)
trainer.setData(dataset)
print('dataset set for trainer')
trainer.trainUntilConvergence()
print('trained to convergence')


NetworkWriter.writeToFile(nn,'recurrentNetwork.xml')
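
As a possible follow-up (an assumption, not part of the original script): the saved XML can be read back with NetworkReader and the network run on a placeholder input; the input length is taken from the network itself, since it depends on how buildRecurrentNetwork defined it.

# Sketch: reload the network written above and activate it once.
from pybrain.tools.xml.networkreader import NetworkReader

nn = NetworkReader.readFrom('recurrentNetwork.xml')
nn.reset()                            # clear recurrent state before a new sequence
print(nn.activate([0.0] * nn.indim))  # all-zero input, purely illustrative
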
Example No. 7
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

#nn=buildRecurrentNetwork()
nn = loadRecurrentNetwork('recurrentNetwork.xml')
dataset = buildDataset()

trainer = RPropMinusTrainer(nn)
trainer.setData(dataset)
print('dataset set for trainer')
trainer.trainUntilConvergence()
print('trained to convergence')

NetworkWriter.writeToFile(nn, 'recurrentNetwork.xml')
def test_multilayer_perceptron():
    def plot(fig, data):
        ax = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data],
                   marker=marker,
                   color=color,
                   s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split

        for x_value in [float(i) * xspan + x_range[0] for i in range(split)]:
            predict_list = []
            for y_value in [
                    float(j) * yspan + y_range[0] for j in range(split)
            ]:
                #if nn.predict([x_value,y_value])[0] >= 0.5:
                if nn.activate([x_value, y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt
    """ トレーニングデータ取得
    """
    x_range = [0, 1]
    y_range = [0, 1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0],
         color='g')
    scat(fig, [key for key, value in liner_data.items() if value == 1],
         color='b')
    """ NN構築
    """
    network = build_network()

    # mlnn = MultiLayerNeuralNetwork( [2, 5, 1],
    #                                 threshold=0.1,
    #                                 start_learning_coef=0.2,
    #                                 sigmoid_alpha=10,
    #                                 mini_batch=100,
    #                                 layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                 rprop=True
    #                                 )
    """ 学習
    """
    #error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network,
                                dataset=supervised,
                                batchlearning=True,
                                verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)

    # compute the y corresponding to each x and draw the learned separating line
    data = get_predict_list(x_range, y_range, network, split=20)
    plot(fig, data)

    # # plot the error history
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    # show the plots
    plt.show()
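
build_network() and get_supervised() are helpers defined elsewhere in this project; the snippet only shows how their results feed RPropMinusTrainer. Purely as a guess at their shape, and not the original code, minimal PyBrain versions could look like this:

# Hypothetical stand-ins for the helpers used above.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SigmoidLayer
from pybrain.datasets import SupervisedDataSet

def build_network():
    # 2 inputs (x, y), a small hidden layer, one sigmoid output in [0, 1]
    return buildNetwork(2, 5, 1, outclass=SigmoidLayer)

def get_supervised(network, inputs, outputs):
    # pair the formatted training inputs/outputs into a SupervisedDataSet
    ds = SupervisedDataSet(network.indim, network.outdim)
    for inp, out in zip(inputs, outputs):
        ds.addSample(inp, out)
    return ds
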
 def learn(self):
     # convert reinforcement dataset to NFQ supervised dataset
     supervised = []
     dats=[]#[seq index][turn]=[state,jointAct,jointReward]
     for i in range(self.num_agents):
         supervised.append(SupervisedDataSet(self.num_features+self.actionDiminInput, 1))
     for i in range(self.dataset[self.indexOfAgent].getNumSequences()):            
         seq=[]
         for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])):
             state=self.dataset[self.indexOfAgent].getSequence(i)[0][j]
             jointAct=[]
             jointReward=[]
             for k in range(self.num_agents):
                 jointAct.append(self.dataset[k].getSequence(i)[1][j][0])
                 jointReward.append(self.dataset[k].getSequence(i)[2][j][0])
             seq.append([state, jointAct, jointReward])
         dats.append(seq)
     #prepare data set
     for i in range(self.num_agents):
         for seq in dats:
             lastexperience = None
             for sarPair in seq:
                 state = sarPair[0]
                 action = sarPair[1]
                 reward = sarPair[2]
                 if not lastexperience:
                     # delay each experience in sequence by one
                     lastexperience = (state, action, reward)
                     continue
                 # use experience from last timestep to do Q update
                 (state_, action_, reward_) = lastexperience
                 
                 #update Q-value function approximator
                 qValuesNext=self._qValuesForAllPossibleJointAction(state)
                 eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing)
                 #Learn
                 inp=self._EncodeStateAndJointActionIntoInputVector(state_, action_)
                 if self.isFirstLerning:
                     target=reward_[i]
                 else:
                     target=reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i))
                 target=np.array([target])
                 supervised[i].addSample(inp, target)
                 # update last experience with current one
                 lastexperience = (state, action, reward)
     if self.isFirstLerning:
         self.isFirstLerning=False
         
     procTrainers=[]
     qResult=Queue()
     for i in range(self.num_agents):
         trainer=RPropMinusTrainer(self.linQ[i],dataset=supervised[i], 
                                   batchlearning=True, 
                                   verbose=False, 
                                   )
         if not self.validateMultiProc:
             trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False)
         else:
             procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer":trainer,"i":i,"q":qResult}))
     if self.validateMultiProc:
         for proc in procTrainers:
             proc.start()
         for i in range(self.num_agents):
             res=qResult.get()
             self.linQ[res[0]]=res[1]
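
_learningQfunction is not shown in this snippet; from the way its results are consumed (res = qResult.get(); self.linQ[res[0]] = res[1]) it presumably trains one agent's Q-network in the child process and sends back the agent index together with the trained network. A guessed minimal version, not the original method:

 def _learningQfunction(self, trainer, i, q):
     # runs in a separate process, so the trained network has to be
     # returned through the result queue rather than mutated in place
     trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
     q.put((i, trainer.module))
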
def test_multilayer_perceptron():

    def plot(fig, data):
        ax  = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax  = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data], marker=marker, color=color, s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split

        for x_value in [ float(i)*xspan+x_range[0] for i in range(split)]:
            predict_list = []
            for y_value in [ float(j) * yspan + y_range[0]  for j in range(split)]:
                #if nn.predict([x_value,y_value])[0] >= 0.5:
                if nn.activate([x_value,y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt

    """ トレーニングデータ取得
    """
    x_range = [0,1]
    y_range = [0,1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0], color='g' )
    scat(fig, [key for key, value in liner_data.items() if value == 1], color='b' )



    """ NN構築
    """
    network = build_network()

    # mlnn = MultiLayerNeuralNetwork( [2, 5, 1],
    #                                 threshold=0.1,
    #                                 start_learning_coef=0.2,
    #                                 sigmoid_alpha=10,
    #                                 mini_batch=100,
    #                                 layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                 rprop=True
    #                                 )

    """ 学習
    """
    #error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network, dataset=supervised, batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)


    # compute the y corresponding to each x and draw the learned separating line
    data = get_predict_list(x_range,y_range, network, split=20)
    plot(fig, data)

    # # plot the error history
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    # show the plots
    plt.show()