def performAction(self, action): """ POMDP tasks, as they have discrete actions, can me used by providing either an index, or an array with a 1-in-n coding (which can be stochastic). """ if type(action) == ndarray: action = drawIndex(action, tolerant = True) self.steps += 1 EpisodicTask.performAction(self, action)
def _produceNewSample(self):
    """ Returns a new sample and its fitness. """
    # pick one mixture component, proportionally to its weight
    chosenOne = drawIndex(self.alphas, True)
    mu = self.mus[chosenOne]
    if self.useAnticipatedMeanShift:
        # every second sample, shift the mean along the anticipated direction,
        # unless elitism protects the currently best center
        if len(self.allsamples) % 2 == 1 and len(self.allsamples) > 1:
            if not (self.elitism and chosenOne == self.bestChosenCenter):
                mu += self.meanShifts[chosenOne]
    # draw from the chosen component's Gaussian
    if self.diagonalOnly:
        sample = normal(mu, self.sigmas[chosenOne])
    else:
        sample = multivariate_normal(mu, self.sigmas[chosenOne])
    # occasionally re-inject the best sample found so far
    if (self.sampleElitism and len(self.allsamples) > self.windowSize
            and len(self.allsamples) % self.windowSize == 0):
        sample = self.bestEvaluable.copy()
    fit = self._oneEvaluation(sample)
    if ((not self.minimize and fit >= self.bestEvaluation)
            or (self.minimize and fit <= self.bestEvaluation)
            or len(self.allsamples) == 0):
        # used to determine which center produced the current best
        self.bestChosenCenter = chosenOne
        self.bestSigma = self.sigmas[chosenOne].copy()
    if self.minimize:
        fit = -fit
    self.allfitnesses.append(fit)
    self.allsamples.append(sample)
    return sample, fit
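# Minimal, self-contained sketch of the mixture sampling step above: pick a
# component with probability proportional to its weight (alpha), then draw
# from that component's Gaussian. All names and numbers below are
# illustrative, not the optimizer's actual attributes.
import numpy as np

def sample_from_mixture(alphas, mus, sigmas, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    k = rng.choice(len(alphas), p=alphas)   # analogous to drawIndex(self.alphas)
    return k, rng.multivariate_normal(mus[k], sigmas[k])

# example: a 2-component mixture over 2-dimensional samples
alphas = np.array([0.3, 0.7])
mus = [np.zeros(2), np.ones(2)]
sigmas = [np.eye(2), 0.5 * np.eye(2)]
component, x = sample_from_mixture(alphas, mus, sigmas)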
def performAction(self, action, onlyavatar=False):
    """ Action is an index for the actionset. """
    if action is None:
        return
    # if actions are given as a vector, pick the argmax
    import numpy
    from pybrain3.utilities import drawIndex
    if isinstance(action, numpy.ndarray):
        if abs(sum(action) - 1) < 1e-5:
            # vector represents probabilities: sample an index from it
            action = drawIndex(action)
        else:
            action = numpy.argmax(action)
    # take action and compute consequences
    if self._avatar:
        self._avatar._readMultiActions = lambda *x: [self._actionset[action]]
    self._game._clearAll(self.visualize)
    # update sprites
    if onlyavatar:
        self._avatar.update(self._game)
    else:
        for s in self._game:
            s.update(self._game)
    # handle collision effects
    self._game._updateCollisionDict()
    self._game._eventHandling()
    self._game._clearAll(self.visualize)
    # update screen
    if self.visualize:
        self._game._drawAll()
        pygame.display.update(VGDLSprite.dirtyrects)
        VGDLSprite.dirtyrects = []
        pygame.time.wait(self.actionDelay)
    if self.recordingEnabled:
        self._previous_state = self._last_state
        self._last_state = self.getState()
        self._allEvents.append((self._previous_state, action, self._last_state))
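# Self-contained sketch of the vector dispatch above: a vector whose entries
# sum to ~1 is treated as action probabilities, any other vector is reduced
# with argmax, and a plain index passes through. The to_action_index helper
# and the example vectors are illustrative only.
import numpy as np

def to_action_index(action):
    if isinstance(action, np.ndarray):
        if abs(action.sum() - 1) < 1e-5:
            rng = np.random.default_rng()
            return int(rng.choice(len(action), p=action))  # stochastic pick
        return int(np.argmax(action))                      # deterministic pick
    return action

print(to_action_index(np.array([0.1, 0.6, 0.3])))  # sampled from the probabilities
print(to_action_index(np.array([2.0, 5.0, 1.0])))  # not a distribution -> argmax -> 1
print(to_action_index(0))                          # plain index passes through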
def learnOneBatch(self):
    # collect a batch of runs as experience
    r0s = []
    lens = []
    avgReward = 0.
    for dummy in range(self.batchSize):
        self.rawDs.newSequence()
        self.valueDs.newSequence()
        self.task.reset()
        self.net.reset()
        acts, obss, rewards = [], [], []
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = drawIndex(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            obss.append(obs)
            y = zeros(len(act))
            y[chosen] = 1
            acts.append(y)
            rewards.append(reward)
        avgReward += sum(rewards) / float(len(rewards))

        # compute the returns from the list of rewards
        current = 0
        returns = []
        for r in reversed(rewards):
            current *= self.task.discount
            current += r
            returns.append(current)
        returns.reverse()
        for i in range(len(obss)):
            self.rawDs.addSample(obss[i], acts[i], returns[i])
            self.valueDs.addSample(obss[i], returns[i])
        r0s.append(returns[0])
        lens.append(len(returns))

    r0s = array(r0s)
    self.totalSteps += sum(lens)
    avgLen = sum(lens) / float(self.batchSize)
    avgR0 = mean(r0s)
    avgReward /= self.batchSize
    if self.verbose:
        print('***', round(avgLen, 3), '***',
              '(avg init exp. return:', round(avgR0, 5), ')', end=' ')
        print('avg reward', round(avgReward, 5), '(tau:', round(self.tau, 3), ')')
        print(lens)

    # storage:
    self.rewardAvg.append(avgReward)
    self.lengthAvg.append(avgLen)
    self.initr0Avg.append(avgR0)

#    if self.vnet == None:
#        # case 1: no value estimator:

    # prepare the dataset for training the acting network
    shaped = self.shapingFunction(r0s)
    self.updateTau(r0s, shaped)
    shaped /= max(shaped)
    for i, seq in enumerate(self.rawDs):
        self.weightedDs.newSequence()
        for sample in seq:
            obs, act, dummy = sample
            self.weightedDs.addSample(obs, act, shaped[i])

#    else:
#        # case 2: value estimator:
#
#        # train the value estimating network
#        if self.verbose: print 'Old value error: ', self.vbp.testOnData()
#        self.vbp.trainEpochs(self.valueTrainEpochs)
#        if self.verbose: print 'New value error: ', self.vbp.testOnData()
#
#        # produce the values and analyze
#        rminusvs = []
#        sizes = []
#        for i, seq in enumerate(self.valueDs):
#            self.vnet.reset()
#            seq = list(seq)
#            for sample in seq:
#                obs, ret = sample
#                val = self.vnet.activate(obs)
#                rminusvs.append(ret - val)
#            sizes.append(len(seq))
#
#        rminusvs = array(rminusvs)
#        shapedRminusv = self.shapingFunction(rminusvs)
#
#        # CHECKME: here?
#        self.updateTau(rminusvs, shapedRminusv)
#        shapedRminusv /= array(sizes)
#        shapedRminusv /= max(shapedRminusv)
#
#        # prepare the dataset for training the acting network
#        rvindex = 0
#        for i, seq in enumerate(self.rawDs):
#            self.weightedDs.newSequence()
#            self.vnet.reset()
#            for sample in seq:
#                obs, act, ret = sample
#                self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
#                rvindex += 1

    # train the acting network
    tmp1, tmp2 = self.bp.trainUntilConvergence(
        maxEpochs=self.maxEpochs,
        validationProportion=self.validationProportion,
        continueEpochs=self.continueEpochs,
        verbose=self.verbose)
    if self.supervisedPlotting:
        from pylab import plot, legend, figure, clf, draw
        figure(1)
        clf()
        plot(tmp1, label='train')
        plot(tmp2, label='valid')
        legend()
        draw()

    return avgLen, avgR0
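# Worked, self-contained sketch of the return computation above: walk the
# reward list backwards, discounting as you go, so each entry ends up as the
# discounted sum of the rewards from that step onward. The discount and
# reward values are made up for illustration.
discount = 0.9
rewards = [0.0, 0.0, 1.0]

current = 0.0
returns = []
for r in reversed(rewards):
    current = current * discount + r
    returns.append(current)
returns.reverse()

print(returns)  # approximately [0.81, 0.9, 1.0]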