class DemonExperiment(object):
    """Off-policy control experiment on the Critterbot simulator.

    Builds one ExpectedGQ control demon per motor-current reward signal;
    all demons learn in parallel while a shared random behaviour policy
    drives the robot.
    """

    # Control-loop period; compared against Chrono.getCurrentMillis() in
    # run(), so this is milliseconds, not seconds.
    Latency = 100

    def __init__(self):
        self.environment = CritterbotSimulator()
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        # Tile coding over every sensor channel of the simulator.
        self.representation = TileCodersNoHashing(
            self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = DemonScheduler()
        for reward in self.rewards:
            self.demons.add(self.createOffPolicyControlDemon(reward))
        # Feature vector of the previous step; None until the first update.
        self.x_t = None

    def createRewardFunction(self):
        """Return one reward function per monitored motor-current sensor."""
        sensorLegend = self.environment.legend()
        return [SensorRewardFunction(sensorLegend, name)
                for name in ('MotorCurrent0', 'MotorCurrent1', 'MotorCurrent2')]

    def createOffPolicyControlDemon(self, rewardFunction):
        """Build an ExpectedGQ control demon that maximizes rewardFunction."""
        toStateAction = TabularAction(self.actions, self.representation.vectorSize())
        nbFeatures = toStateAction.actionStateFeatureSize()
        stepSize = .1 / self.representation.nbActive()
        # GQ(alpha_v, alpha_w, beta, lambda, nbFeatures)
        gq = GQ(stepSize, stepSize, .1, 0.1, nbFeatures)
        targetPolicy = Greedy(gq, self.actions, toStateAction)
        controlGQ = ExpectedGQ(gq, self.actions, toStateAction, targetPolicy,
                               self.behaviourPolicy)
        return ControlOffPolicyDemon(rewardFunction, controlGQ)

    def learn(self, a_t, o_tp1):
        """Propagate the new observation to rewards, features and demons."""
        for reward in self.rewards:
            reward.update(o_tp1)
        x_tp1 = self.representation.project(o_tp1)
        self.demons.update(self.x_t, a_t, x_tp1)
        self.x_t = x_tp1

    def run(self):
        """Observe/learn/act loop, paced to roughly Latency milliseconds."""
        a_t = None
        while not self.environment.isClosed():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            remainingMillis = self.Latency - self.latencyTimer.getCurrentMillis()
            if remainingMillis > 0:
                time.sleep(remainingMillis / 1000.0)

    def zephyrize(self):
        """Publish the environment, the demons and every reward to Zephyr."""
        clock = self.environment.clock()
        for monitored in (self.environment, self.demons):
            zepy.advertise(monitored, clock)
        for reward in self.rewards:
            zepy.monattr(reward, 'rewardValue', clock=clock, label=reward.label)
class DemonExperiment(object):
    """Nexting experiment: on-policy prediction on the Critterbot simulator.

    One TD(lambda) prediction demon is created for every sensor in the
    legend; the demons are grouped in a Horde and updated while a random
    behaviour policy drives the robot.
    """

    # Loop period; compared against Chrono.getCurrentMillis() in run(),
    # so this is milliseconds, not seconds.
    Latency = 100

    def __init__(self):
        command = CritterbotSimulator.startSimulator()
        self.environment = CritterbotSimulator(command)
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(
            self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = [self.createOnPolicyPredictionDemon(reward)
                       for reward in self.rewards]
        self.horde = Horde()
        self.horde.demons().addAll(self.demons)
        # Reward functions are updated by the horde before the demons.
        self.horde.beforeFunctions().addAll(self.rewards)
        self.x_t = None  # buffered copy of the previous feature vector
        self.clock = zepy.clock("Nexting Clock")

    def createRewardFunction(self):
        """Return one reward function per sensor label in the legend."""
        sensorLegend = self.environment.legend()
        return [SensorRewardFunction(sensorLegend, label)
                for label in sensorLegend.getLabels()]

    def createOnPolicyPredictionDemon(self, rewardFunction):
        """Build a TD(lambda) prediction demon for rewardFunction."""
        nbFeatures = self.representation.vectorSize()
        stepSize = .1 / self.representation.vectorNorm()
        # TDLambda(lambda, gamma, alpha, nbFeatures)
        td = TDLambda(.3, .9, stepSize, nbFeatures)
        return PredictionDemon(rewardFunction, td)

    def learn(self, a_t, o_tp1):
        """Project the raw observation and update the whole horde."""
        x_tp1 = self.representation.project(o_tp1.doubleValues())
        self.horde.update(o_tp1, self.x_t, a_t, x_tp1)
        # Reuse the previous buffer when possible instead of reallocating.
        self.x_t = Vectors.bufferedCopy(x_tp1, self.x_t)

    def run(self):
        """Run until the Zephyr clock stops, then close the environment."""
        a_t = None
        while self.clock.tick():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewRawObs()
            self.learn(a_t, o_tp1)
            self.behaviourPolicy.update(None)
            a_tp1 = self.behaviourPolicy.sampleAction()
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            remainingMillis = self.Latency - self.latencyTimer.getCurrentMillis()
            if remainingMillis > 0:
                time.sleep(remainingMillis / 1000.0)
        self.environment.close()

    def zephyrize(self):
        """Publish the environment, the horde and every reward to Zephyr."""
        zepy.advertise(self.clock, self.environment)
        zepy.advertise(self.clock, self.horde)
        for reward in self.rewards:
            zepy.monattr(self.clock, reward, 'rewardValue', label=reward.label)
class DemonExperiment(object):
    """Off-policy prediction experiment on the Critterbot simulator.

    Each motor-current reward gets a GTD prediction demon whose target
    policy always selects the Left action, while the behaviour policy
    drives the robot at random.
    """

    # Loop period; used against Chrono.getCurrentMillis() in run(), so
    # this is milliseconds, not seconds.
    Latency = 100

    def __init__(self):
        self.environment = CritterbotSimulator()
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(
            self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = DemonScheduler()
        for reward in self.rewards:
            # One fresh single-action target policy per demon.
            targetPolicy = SingleActionPolicy(XYThetaAction.Left)
            self.demons.add(self.createOffPolicyPredictionDemon(reward, targetPolicy))
        # Feature vector of the previous step; None until the first update.
        self.x_t = None

    def createRewardFunction(self):
        """Return one reward function per monitored motor-current sensor."""
        sensorLegend = self.environment.legend()
        return [SensorRewardFunction(sensorLegend, name)
                for name in ('MotorCurrent0', 'MotorCurrent1', 'MotorCurrent2')]

    def createOffPolicyPredictionDemon(self, rewardFunction, targetPolicy):
        """Build a GTD prediction demon for rewardFunction under targetPolicy."""
        nbFeatures = self.representation.vectorSize()
        stepSize = .1 / self.representation.nbActive()
        # GTD(gamma, alpha_v, alpha_w, nbFeatures)
        gtd = GTD(.9, stepSize, stepSize, nbFeatures)
        return PredictionOffPolicyDemon(rewardFunction, gtd, targetPolicy,
                                        self.behaviourPolicy)

    def learn(self, a_t, o_tp1):
        """Propagate the new observation to rewards, features and demons."""
        for reward in self.rewards:
            reward.update(o_tp1)
        x_tp1 = self.representation.project(o_tp1)
        self.demons.update(self.x_t, a_t, x_tp1)
        self.x_t = x_tp1

    def run(self):
        """Observe/learn/act loop, paced to roughly Latency milliseconds."""
        a_t = None
        while not self.environment.isClosed():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            remainingMillis = self.Latency - self.latencyTimer.getCurrentMillis()
            if remainingMillis > 0:
                time.sleep(remainingMillis / 1000.0)

    def zephyrize(self):
        """Publish the environment, the demons and every reward to Zephyr."""
        clock = self.environment.clock()
        for monitored in (self.environment, self.demons):
            zepy.advertise(monitored, clock)
        for reward in self.rewards:
            zepy.monattr(reward, 'rewardValue', clock=clock, label=reward.label)
class DemonExperiment(object):
    """Nexting experiment: on-policy TD(lambda) prediction per sensor.

    Creates one prediction demon for every sensor in the legend and
    schedules them with a DemonScheduler while a random behaviour policy
    drives the Critterbot simulator.
    """

    # Loop period; compared against Chrono.getCurrentMillis() in run(),
    # so this is milliseconds, not seconds.
    Latency = 100

    def __init__(self):
        self.environment = CritterbotSimulator()
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(
            self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = DemonScheduler()
        for reward in self.rewards:
            self.demons.add(self.createOnPolicyPredictionDemon(reward))
        # Feature vector of the previous step; None until the first update.
        self.x_t = None

    def createRewardFunction(self):
        """Return one reward function per sensor label in the legend."""
        sensorLegend = self.environment.legend()
        return [SensorRewardFunction(sensorLegend, label)
                for label in sensorLegend.getLabels()]

    def createOnPolicyPredictionDemon(self, rewardFunction):
        """Build a TD(lambda) prediction demon for rewardFunction."""
        nbFeatures = self.representation.vectorSize()
        stepSize = .1 / self.representation.nbActive()
        # TDLambda(lambda, gamma, alpha, nbFeatures)
        td = TDLambda(.3, .9, stepSize, nbFeatures)
        return PredictionDemon(rewardFunction, td)

    def learn(self, a_t, o_tp1):
        """Propagate the new observation to rewards, features and demons."""
        for reward in self.rewards:
            reward.update(o_tp1)
        x_tp1 = self.representation.project(o_tp1)
        self.demons.update(self.x_t, a_t, x_tp1)
        self.x_t = x_tp1

    def run(self):
        """Observe/learn/act loop, paced to roughly Latency milliseconds."""
        a_t = None
        while not self.environment.isClosed():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            remainingMillis = self.Latency - self.latencyTimer.getCurrentMillis()
            if remainingMillis > 0:
                time.sleep(remainingMillis / 1000.0)

    def zephyrize(self):
        """Publish the environment, the demons and every reward to Zephyr."""
        clock = self.environment.clock()
        for monitored in (self.environment, self.demons):
            zepy.advertise(monitored, clock)
        for reward in self.rewards:
            zepy.monattr(reward, 'rewardValue', clock=clock, label=reward.label)
class DemonExperiment(object):
    """Horde off-policy control experiment on the Critterbot simulator.

    One GreedyGQ control demon is built per motor-current reward; demons
    and reward functions are registered with a Horde and updated while a
    random behaviour policy drives the robot.
    """

    # Loop period; compared against Chrono.getCurrentMillis() in run(),
    # so this is milliseconds, not seconds.
    Latency = 100

    def __init__(self):
        command = CritterbotSimulator.startSimulator()
        self.environment = CritterbotSimulator(command)
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(
            self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = [self.createOffPolicyControlDemon(reward)
                       for reward in self.rewards]
        self.horde = Horde()
        self.horde.demons().addAll(self.demons)
        # Reward functions are updated by the horde before the demons.
        self.horde.beforeFunctions().addAll(self.rewards)
        self.x_t = None  # buffered copy of the previous feature vector
        self.clock = zepy.clock("Horde Off-policy Control demons")

    def createRewardFunction(self):
        """Return one reward function per monitored motor-current sensor."""
        sensorLegend = self.environment.legend()
        return [SensorRewardFunction(sensorLegend, name)
                for name in ('MotorCurrent0', 'MotorCurrent1', 'MotorCurrent2')]

    def createOffPolicyControlDemon(self, rewardFunction):
        """Build a GreedyGQ control demon that maximizes rewardFunction."""
        toStateAction = TabularAction(self.actions,
                                      self.representation.vectorNorm(),
                                      self.representation.vectorSize())
        nbFeatures = toStateAction.vectorSize()
        # GQ(alpha_v, alpha_w, beta, lambda, nbFeatures); the secondary
        # weights use a much smaller step size than the main ones.
        gq = GQ(.1 / toStateAction.vectorNorm(),
                .001 / toStateAction.vectorNorm(),
                .1, 0.1, nbFeatures)
        targetPolicy = Greedy(gq, self.actions, toStateAction)
        controlGQ = GreedyGQ(gq, self.actions, toStateAction, targetPolicy,
                             self.behaviourPolicy)
        return ControlOffPolicyDemon(rewardFunction, controlGQ)

    def learn(self, a_t, o_tp1):
        """Project the raw observation and update the whole horde."""
        x_tp1 = self.representation.project(o_tp1.doubleValues())
        self.horde.update(o_tp1, self.x_t, a_t, x_tp1)
        # Reuse the previous buffer when possible instead of reallocating.
        self.x_t = Vectors.bufferedCopy(x_tp1, self.x_t)

    def run(self):
        """Run until the Zephyr clock stops, then close the environment."""
        a_t = None
        while self.clock.tick():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewRawObs()
            self.learn(a_t, o_tp1)
            self.behaviourPolicy.update(None)
            a_tp1 = self.behaviourPolicy.sampleAction()
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            remainingMillis = self.Latency - self.latencyTimer.getCurrentMillis()
            if remainingMillis > 0:
                time.sleep(remainingMillis / 1000.0)
        self.environment.close()

    def zephyrize(self):
        """Publish the environment, the horde and every reward to Zephyr."""
        zepy.advertise(self.clock, self.environment)
        zepy.advertise(self.clock, self.horde)
        for reward in self.rewards:
            zepy.monattr(self.clock, reward, 'rewardValue', label=reward.label)
class DemonExperiment(LabeledCollection):
    """Nexting experiment on the iRobot Create.

    Predicts each cliff sensor at several time scales (one TD(lambda)
    demon per sensor/gamma pair), verifies the predictions online, and
    publishes everything to Zephyr and a timed log file.
    """

    Latency = 100  # milliseconds between control steps
    HistoryLength = 10  # number of past observations kept in the history
    sensorsOfInterest = ["CliffSignalLeft", "CliffSignalFrontLeft",
                         "CliffSignalFrontRight", "CliffSignalRight"]

    def __init__(self):
        self.logfile = TimedFileLogger("/tmp/log.crtrlog")
        self.environment = CreateRobot()
        self.environment.fullMode()
        self.latencyTimer = Chrono()
        self.clock = Clock("CreateNexting")
        self.rewards = self.createRewardFunction()
        self.actions = [CreateAction(-200, +200)]
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        # Fix: was a hard-coded 10; use the class constant so the history
        # depth and the tile-coder loop below cannot drift apart.
        self.obsHistory = ObsHistory(self.HistoryLength, self.environment.legend())
        self.representation = TileCodersNoHashing(
            self.obsHistory.historyVectorSize(), 0, 4096)
        self.representation.includeActiveFeature()
        # One tile coder per (sensor, time-shift) pair.
        for name in self.sensorsOfInterest:
            for timeShift in range(self.HistoryLength):
                indexes = self.obsHistory.selectIndexes(timeShift, name)
                self.representation.addTileCoder(indexes, 64, 8)
        self.demons = DemonScheduler()
        self.verifiers = []
        # Fix: demonToData used to be a mutable class-level dict, silently
        # shared (and accumulated) across every DemonExperiment instance.
        self.demonToData = {}
        for rewardFunction in self.rewards:
            for gamma in [0, 0.5, 0.75, 7 / 8.0, 15 / 16.0]:
                demon = self.createOnPolicyPredictionDemon(rewardFunction, gamma)
                verifier = PredictionDemonVerifier(demon)
                self.verifiers.append(verifier)
                self.demons.add(demon)
                self.demonToData[demon] = (verifier,
                                           rewardFunction.label() + str(gamma))
        # Feature vector of the previous step; None until the first update.
        self.x_t = None

    def label(self, index):
        """Return the label of the demon at position index (LabeledCollection)."""
        return self.demons.demons().get(index).label()

    def createRewardFunction(self):
        """Return one reward function per cliff sensor of interest."""
        legend = self.environment.legend()
        return [SensorRewardFunction(legend, label)
                for label in self.sensorsOfInterest]

    def createOnPolicyPredictionDemon(self, rewardFunction, gamma):
        """Build a TD(lambda=1) prediction demon for rewardFunction at gamma."""
        alpha = 0.1 / self.representation.nbActive()
        nbFeatures = self.representation.vectorSize()
        td = TDLambda(1.0, gamma, alpha, nbFeatures)
        return PredictionDemon(rewardFunction, td)

    def learn(self, a_t, o_tp1):
        """Update rewards, history, features, demons and their verifiers."""
        for rewardFunction in self.rewards:
            rewardFunction.update(o_tp1)
        ho_tp1 = self.obsHistory.update(o_tp1)
        x_tp1 = self.representation.project(ho_tp1)
        self.demons.update(self.x_t, a_t, x_tp1)
        for verifier in self.verifiers:
            verifier.update(False)
        self.x_t = x_tp1

    def run(self):
        """Observe/learn/act loop, paced to roughly Latency milliseconds."""
        a_t = None
        while not self.environment.isClosed():
            self.clock.tick()
            self.logfile.update()
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            waitingTime = self.Latency - self.latencyTimer.getCurrentMillis()
            if waitingTime > 0:
                time.sleep(waitingTime / 1000.0)

    def zephyrize(self):
        """Publish verifiers, environment, demons and rewards to Zephyr."""
        Zephyr.registerLabeledCollection(self, "demons", "")
        monitoredList = [self.verifiers, self.environment, self.demons]
        for monitored in monitoredList:
            zepy.advertise(monitored, self.clock)
            self.logfile.add(monitored, 0)
        # NOTE(review): return value was never used; keeping the call on the
        # assumption createMonitor registers itself with the clock — confirm.
        ZephyrPlotting.createMonitor(self.clock)
        for rewardFunction in self.rewards:
            zepy.monattr(rewardFunction, "rewardValue", clock=self.clock,
                         label=rewardFunction.label())
class DemonExperiment(LabeledCollection):
    """Nexting experiment on the iRobot Create robot.

    For every cliff sensor of interest, learns TD(lambda) predictions at
    five discount factors, checks them with online verifiers, and exposes
    the whole setup through Zephyr plus a timed log file.
    """

    Latency = 100  # milliseconds between control steps
    HistoryLength = 10  # depth of the observation history buffer
    sensorsOfInterest = [
        "CliffSignalLeft", "CliffSignalFrontLeft", "CliffSignalFrontRight",
        "CliffSignalRight"
    ]

    def __init__(self):
        self.logfile = TimedFileLogger('/tmp/log.crtrlog')
        self.environment = CreateRobot()
        self.environment.fullMode()
        self.latencyTimer = Chrono()
        self.clock = Clock("CreateNexting")
        self.rewards = self.createRewardFunction()
        self.actions = [CreateAction(-200, +200)]
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        # Fix: the history depth was a hard-coded 10; tie it to the class
        # constant also used by the tile-coder loop below.
        self.obsHistory = ObsHistory(self.HistoryLength, self.environment.legend())
        self.representation = TileCodersNoHashing(
            self.obsHistory.historyVectorSize(), 0, 4096)
        self.representation.includeActiveFeature()
        # Add one tile coder per (sensor, time-shift) combination.
        for name in self.sensorsOfInterest:
            for timeShift in range(self.HistoryLength):
                indexes = self.obsHistory.selectIndexes(timeShift, name)
                self.representation.addTileCoder(indexes, 64, 8)
        self.demons = DemonScheduler()
        self.verifiers = []
        # Fix: previously a class-level dict, shared by all instances and
        # growing across experiments; it belongs to each instance.
        self.demonToData = {}
        for rewardFunction in self.rewards:
            for gamma in [0, 0.5, 0.75, 7 / 8., 15 / 16.]:
                demon = self.createOnPolicyPredictionDemon(rewardFunction, gamma)
                verifier = PredictionDemonVerifier(demon)
                self.verifiers.append(verifier)
                self.demons.add(demon)
                self.demonToData[demon] = (verifier,
                                           rewardFunction.label() + str(gamma))
        # Previous step's feature vector; None until the first update.
        self.x_t = None

    def label(self, index):
        """Label of the index-th demon, for the LabeledCollection interface."""
        return self.demons.demons().get(index).label()

    def createRewardFunction(self):
        """Return one reward function per cliff sensor of interest."""
        legend = self.environment.legend()
        return [SensorRewardFunction(legend, label)
                for label in self.sensorsOfInterest]

    def createOnPolicyPredictionDemon(self, rewardFunction, gamma):
        """Build a TD(lambda=1) prediction demon for rewardFunction at gamma."""
        alpha = .1 / self.representation.nbActive()
        nbFeatures = self.representation.vectorSize()
        td = TDLambda(1.0, gamma, alpha, nbFeatures)
        return PredictionDemon(rewardFunction, td)

    def learn(self, a_t, o_tp1):
        """Update rewards, history, features, demons and their verifiers."""
        for rewardFunction in self.rewards:
            rewardFunction.update(o_tp1)
        ho_tp1 = self.obsHistory.update(o_tp1)
        x_tp1 = self.representation.project(ho_tp1)
        self.demons.update(self.x_t, a_t, x_tp1)
        for verifier in self.verifiers:
            verifier.update(False)
        self.x_t = x_tp1

    def run(self):
        """Observe/learn/act loop, paced to roughly Latency milliseconds."""
        a_t = None
        while not self.environment.isClosed():
            self.clock.tick()
            self.logfile.update()
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            waitingTime = self.Latency - self.latencyTimer.getCurrentMillis()
            if waitingTime > 0:
                time.sleep(waitingTime / 1000.)

    def zephyrize(self):
        """Publish verifiers, environment, demons and rewards to Zephyr."""
        Zephyr.registerLabeledCollection(self, "demons", "")
        monitoredList = [self.verifiers, self.environment, self.demons]
        for monitored in monitoredList:
            zepy.advertise(monitored, self.clock)
            self.logfile.add(monitored, 0)
        # NOTE(review): the returned monitor was never used; keeping the call
        # on the assumption createMonitor registers itself — confirm.
        ZephyrPlotting.createMonitor(self.clock)
        for rewardFunction in self.rewards:
            zepy.monattr(rewardFunction, 'rewardValue', clock=self.clock,
                         label=rewardFunction.label())