from pybrain.rl.learners.valuebased import Q
from pybrain.datasets import ReinforcementDataSet

# Project-local dependencies (not part of PyBrain): `const` holds the
# hyperparameters (ALPHA, GAMMA, EPSILON, DECAY, PERIODS, ALPHA_ADJ_PERIOD)
# and FeasibleEpsilonGreedyExplorer is a project-specific explorer; both must
# be imported from their project modules.


class GPSLearner(Q):

    def __init__(self):
        Q.__init__(self, const.ALPHA, const.GAMMA)
        self.explorer = FeasibleEpsilonGreedyExplorer(const.EPSILON, const.DECAY)
        self.dataset2 = ReinforcementDataSet(1, 1)

    def learn(self):
        """
        Performs Q-learning on the observed samples, but also learns from
        states in adjacent time periods, albeit with a slower learning rate.
        For example, traffic on an edge at 4:30PM will be somewhat similar to
        traffic at 5:00PM, so an observation at 4:30PM can also be used to
        update the 5:00PM state.
        """
        self.alpha = const.ALPHA
        Q.learn(self)  # do normal learning on the observed samples

        for seq in self.dataset:
            self.dataset2.newSequence()
            for state, action, reward in seq:
                # add the same sample for the states of adjacent time periods
                period = state % const.PERIODS
                node = int(state / const.PERIODS)
                self.dataset2.addSample(node * const.PERIODS + (period + 1) % const.PERIODS,
                                        action, reward)
                self.dataset2.addSample(node * const.PERIODS + (period - 1) % const.PERIODS,
                                        action, reward)

        # learn from the adjacent-period samples with a reduced learning rate
        temp = self.dataset
        self.dataset = self.dataset2
        self.alpha = const.ALPHA_ADJ_PERIOD
        Q.learn(self)
        self.dataset = temp
        self.dataset2.clear()

#GPSLearner().learn()
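# Illustrative sketch of the state encoding used above. GPSLearner assumes
# states are encoded as state = node * const.PERIODS + period, so the
# "adjacent" states are obtained by shifting the period by +/-1 with
# wrap-around. The helper below is a hypothetical, standalone demonstration of
# that indexing, assuming PERIODS = 48 (half-hour slots in a day); it is not
# used by GPSLearner itself.

def _adjacent_period_states(state, periods=48):
    """Return the state indices of the previous and next time period."""
    period = state % periods
    node = state // periods
    return (node * periods + (period - 1) % periods,
            node * periods + (period + 1) % periods)

# Example: node 3 at 4:30PM (period 33) is adjacent to the 4:00PM and 5:00PM
# states: _adjacent_period_states(3 * 48 + 33) == (3 * 48 + 32, 3 * 48 + 34).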
from pybrain.rl.agents.agent import Agent
from pybrain.datasets import ReinforcementDataSet


class LoggingAgent(Agent):
    """ This agent stores actions, states, and rewards encountered during
        interaction with an environment in a ReinforcementDataSet (which is a
        variation of SequentialDataSet). The stored history can be used for
        learning and is erased by resetting the agent. It also makes sure that
        integrateObservation, getAction and giveReward are called in exactly
        that order.
    """

    logging = True

    lastobs = None
    lastaction = None
    lastreward = None

    def __init__(self, indim, outdim, **kwargs):
        self.setArgs(**kwargs)

        # store input and output dimension
        self.indim = indim
        self.outdim = outdim

        # create the history dataset
        self.history = ReinforcementDataSet(indim, outdim)

    def integrateObservation(self, obs):
        """Step 1: store the observation received in a temporary variable
        until the action is requested and the reward is given. """
        self.lastobs = obs
        self.lastaction = None
        self.lastreward = None

    def getAction(self):
        """Step 2: store the action in a temporary variable until the reward
        is given. """
        assert self.lastobs is not None
        assert self.lastaction is None
        assert self.lastreward is None

        # implement getAction in a subclass and set self.lastaction there

    def giveReward(self, r):
        """Step 3: store observation, action and reward in the history
        dataset. """
        # step 3: assume that state and action have been set
        assert self.lastobs is not None
        assert self.lastaction is not None
        assert self.lastreward is None
        self.lastreward = r

        # store state, action and reward in the dataset if logging is enabled
        if self.logging:
            self.history.addSample(self.lastobs, self.lastaction, self.lastreward)

    def newEpisode(self):
        """ Indicate the beginning of a new episode in the training cycle. """
        if self.logging:
            self.history.newSequence()

    def reset(self):
        """ Clear the history of the agent. """
        self.lastobs = None
        self.lastaction = None
        self.lastreward = None
        self.history.clear()

    def _reset(self):
        """ Clear the temporary variables, but keep the stored history. """
        self.lastobs = None
        self.lastaction = None
        self.lastreward = None
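# Illustrative sketch of the protocol that LoggingAgent's asserts enforce:
# integrateObservation -> getAction -> giveReward, once per step, with
# newEpisode() marking episode boundaries. The subclass below is hypothetical
# (it always answers with action 0) and only exists to show that a subclass
# merely has to fill in getAction and set self.lastaction.

class ConstantActionAgent(LoggingAgent):
    """ Hypothetical minimal subclass that always chooses action 0. """

    def getAction(self):
        LoggingAgent.getAction(self)  # runs the call-order asserts
        self.lastaction = [0.0]
        return self.lastaction

# One logged step (assuming indim=1, outdim=1):
#   agent = ConstantActionAgent(1, 1)
#   agent.newEpisode()
#   agent.integrateObservation([0.5])   # step 1
#   agent.getAction()                   # step 2
#   agent.giveReward(1.0)               # step 3 -> sample stored in agent.history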
# Imports assumed from the standard PyBrain / NumPy layout.
from numpy import exp, dot, argmax, zeros, array, mean
from numpy.random import randn

from pybrain.rl.learners.directsearch.directsearch import DirectSearchLearner
from pybrain.datasets import ImportanceDataSet, SequentialDataSet, ReinforcementDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.utilities import drawIndex


class RWR(DirectSearchLearner):
    """ Reward-weighted regression.

    The algorithm is currently limited to discrete-action episodic tasks,
    subclasses of POMDPTasks.
    """

    # parameters
    batchSize = 20

    # feedback settings
    verbose = True
    greedyRuns = 20
    supervisedPlotting = False

    # settings for the supervised training
    learningRate = 0.005
    momentum = 0.9
    maxEpochs = 20
    validationProportion = 0.33
    continueEpochs = 2

    # parameters for the variation that uses a value function
    # TODO: split into 2 classes.
    valueLearningRate = None
    valueMomentum = None
    #valueTrainEpochs = 5
    resetAllWeights = False
    netweights = 0.01

    def __init__(self, net, task, valueNetwork=None, **args):
        self.net = net
        self.task = task
        self.setArgs(**args)
        if self.valueLearningRate is None:
            self.valueLearningRate = self.learningRate
        if self.valueMomentum is None:
            self.valueMomentum = self.momentum
        if self.supervisedPlotting:
            from pylab import ion
            ion()

        # adaptive temperature:
        self.tau = 1.

        # prepare the datasets to be used
        self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
        self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
        self.valueDs = SequentialDataSet(self.task.outdim, 1)

        # prepare the supervised trainers
        self.bp = BackpropTrainer(self.net, self.weightedDs, self.learningRate,
                                  self.momentum, verbose=False,
                                  batchlearning=True)

        # CHECKME: outsource
        self.vnet = valueNetwork
        if valueNetwork is not None:
            self.vbp = BackpropTrainer(self.vnet, self.valueDs,
                                       self.valueLearningRate,
                                       self.valueMomentum,
                                       verbose=self.verbose)

        # keep information:
        self.totalSteps = 0
        self.totalEpisodes = 0

    def shapingFunction(self, R):
        return exp(self.tau * R)

    def updateTau(self, R, U):
        self.tau = sum(U) / dot((R - self.task.minReward), U)

    def reset(self):
        self.weightedDs.clear()
        self.valueDs.clear()
        self.rawDs.clear()
        self.bp.momentumvector *= 0.0
        if self.vnet is not None:
            self.vbp.momentumvector *= 0.0
            if self.resetAllWeights:
                self.vnet.params[:] = randn(len(self.vnet.params)) * self.netweights

    def greedyEpisode(self):
        """ Run one episode with greedy decisions and return the list of
        rewards received. """
        rewards = []
        self.task.reset()
        self.net.reset()
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = argmax(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            rewards.append(reward)
        return rewards

    def learn(self, batches):
        self.greedyAvg = []
        self.rewardAvg = []
        self.lengthAvg = []
        self.initr0Avg = []
        for b in range(batches):
            if self.verbose:
                print()
                print('Batch', b + 1)
            self.reset()
            self.learnOneBatch()
            self.totalEpisodes += self.batchSize

            # greedy measure (avg over some greedy runs)
            rws = 0.
            for dummy in range(self.greedyRuns):
                tmp = self.greedyEpisode()
                rws += (sum(tmp) / float(len(tmp)))
            self.greedyAvg.append(rws / self.greedyRuns)
            if self.verbose:
                print('::', round(rws / self.greedyRuns, 5), '::')

    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []
        lens = []
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))

            # compute the returns from the list of rewards
            current = 0
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))

        r0s = array(r0s)
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print('***', round(avgLen, 3), '***',
                  '(avg init exp. return:', round(avgR0, 5), ')')
            print('avg reward', round(avgReward, 5),
                  '(tau:', round(self.tau, 3), ')')
            print(lens)

        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)

        # if self.vnet == None:
        # # case 1: no value estimator:

        # prepare the dataset for training the acting network
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])

        # else:
        # # case 2: value estimator:
        #
        # # train the value estimating network
        # if self.verbose: print('Old value error: ', self.vbp.testOnData())
        # self.vbp.trainEpochs(self.valueTrainEpochs)
        # if self.verbose: print('New value error: ', self.vbp.testOnData())
        #
        # # produce the values and analyze
        # rminusvs = []
        # sizes = []
        # for i, seq in enumerate(self.valueDs):
        #     self.vnet.reset()
        #     seq = list(seq)
        #     for sample in seq:
        #         obs, ret = sample
        #         val = self.vnet.activate(obs)
        #         rminusvs.append(ret-val)
        #     sizes.append(len(seq))
        #
        # rminusvs = array(rminusvs)
        # shapedRminusv = self.shapingFunction(rminusvs)
        # # CHECKME: here?
        # self.updateTau(rminusvs, shapedRminusv)
        # shapedRminusv /= array(sizes)
        # shapedRminusv /= max(shapedRminusv)
        #
        # # prepare the dataset for training the acting network
        # rvindex = 0
        # for i, seq in enumerate(self.rawDs):
        #     self.weightedDs.newSequence()
        #     self.vnet.reset()
        #     for sample in seq:
        #         obs, act, ret = sample
        #         self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
        #         rvindex += 1

        # train the acting network
        tmp1, tmp2 = self.bp.trainUntilConvergence(
            maxEpochs=self.maxEpochs,
            validationProportion=self.validationProportion,
            continueEpochs=self.continueEpochs,
            verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()

        return avgLen, avgR0
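# Illustrative sketch of the reward shaping at the heart of RWR above: each
# episode return R is mapped to a weight exp(tau * R), the temperature tau is
# adapted via updateTau, and the normalised weights become the importance
# values for the weighted supervised regression on (obs, act) pairs. The
# standalone snippet below reproduces just that weighting step on made-up
# returns (the numbers and minReward = 0 are hypothetical).

if __name__ == '__main__':
    returns = array([1.0, 2.0, 4.0])   # hypothetical initial returns (r0s)
    min_reward = 0.0                   # stands in for self.task.minReward
    tau = 1.0                          # initial adaptive temperature

    shaped = exp(tau * returns)                              # shapingFunction
    tau = shaped.sum() / dot(returns - min_reward, shaped)   # updateTau
    weights = shaped / shaped.max()                          # normalisation in learnOneBatch

    # Higher-return episodes get much larger importance weights:
    print('new tau:', round(tau, 3))
    print('weights:', [round(float(w), 3) for w in weights])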