def setUp(self):
    """Prepare a deterministic heat-seeking wolf policy for the tests."""
    self.sheepId, self.wolfId = 0, 1
    self.getSheepXPos = GetAgentPosFromState(self.sheepId)
    self.getWolfXPos = GetAgentPosFromState(self.wolfId)
    self.actionMagnitude = 1
    # Wolf moves from its own position directly toward the sheep's position.
    self.wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        self.getWolfXPos, self.getSheepXPos, self.actionMagnitude)
def testTerminal(self, state, trueTerminal):
    """Combined catch-or-out-of-bounds terminal predicate matches expectation."""
    sheepIndex, wolfIndex = 0, 1
    extractSheepPos = GetAgentPosFromState(sheepIndex)
    extractWolfPos = GetAgentPosFromState(wolfIndex)
    # Episode also ends when the sheep leaves the arena boundaries.
    boundaryTerminal = IsBoundaryTerminal(
        self.xBoundary, self.yBoundary, extractSheepPos)
    catchRadius = 1
    terminalCheck = IsTerminal(
        extractWolfPos, extractSheepPos, catchRadius, boundaryTerminal)
    self.assertEqual(terminalCheck(state), trueTerminal)
def setUp(self):
    """Construct the sheep's competitive reward function under test."""
    sheepIndex, wolfIndex = 0, 1
    extractSheepXPos = GetAgentPosFromState(sheepIndex)
    extractWolfXPos = GetAgentPosFromState(wolfIndex)
    catchRadius = 1
    terminalCheck = IsTerminal(extractWolfXPos, extractSheepXPos, catchRadius)
    episodeLength = 20
    # Small per-step survival bonus; -1 penalty when the episode terminates.
    aliveBonus = 1 / episodeLength
    terminalPenalty = -1
    self.rewardSheep = RewardFunctionCompete(
        aliveBonus, terminalPenalty, terminalCheck)
def testBoundaryTerminal(self, state, trueTerminal):
    """Out-of-bounds terminal predicate for the sheep matches expectation."""
    extractSheepPos = GetAgentPosFromState(0)  # sheep is agent 0
    boundaryTerminal = IsBoundaryTerminal(
        self.xBoundary, self.yBoundary, extractSheepPos)
    self.assertEqual(boundaryTerminal(state), trueTerminal)
def main():
    """Replay a trained DDPG sheep policy against a heat-seeking wolf and
    render the resulting chase trajectories to image frames.

    Loads actor weights from a checkpoint, samples 10 trajectories with a
    fixed initial state, and (when showDemo) draws each one with pygame,
    saving frames under <parent>/chasingDemo/Demo.

    NOTE(review): reconstructed indentation from a whitespace-mangled source;
    the demo/drawing section is placed inside the sampling loop — confirm
    against the original file.
    """
    # State is (x, y) per agent; actions are 2-D continuous in [-1, 1].
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # Rebuild the actor network and restore its trained weights.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)
    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', 'trainedDDPGModels',
        'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.001_learningRateCritic=0.001_maxEpisode=2000_maxTimeStep=20_minibatchSize=32_wolfSpeed=0.5.ckpt'
    )
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=400_maxTimeStep=100_minibatchSize=32_wolfSpeed=1.ckpt')
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=1000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt')
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=5000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt')
    restoreVariables(actorModel, actorModelPath)
    # Greedy (no-noise) policy from the trained actor.
    sheepPolicy = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain,
                                 actorModel, getNoise=None)

    # Scripted wolf: deterministically heads toward the sheep at wolfSpeed.
    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 0.5
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)

    # Arena transition: intended move clipped to stay inside the boundary.
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    transit = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    # transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # Reward: alive bonus each step, terminal value on capture, plus a
    # boundary-proximity punishment heuristic.
    maxTimeStep = 20
    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20
    killzoneRadius = 0
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary,
                                                  sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(
        sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    rewardSheepWithBoundaryHeuristics = RewardSheepWithBoundaryHeuristics(
        rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepPos)
    # Sheep's 2-D action slice out of the flat joint-action vector.
    getSheepAction = lambda actions: [
        actions[sheepId * actionDim], actions[sheepId * actionDim + 1]
    ]
    getReward = lambda state, action, nextState: rewardSheepWithBoundaryHeuristics(
        state, getSheepAction(action), nextState)
    # Joint policy: sheep action followed by wolf action.
    policy = lambda state: list(sheepPolicy(state)) + list(wolfPolicy(state))

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # Fixed start: sheep at (10, 10), wolf at (15, 5).
    reset = lambda: np.array([10, 10, 15, 5])

    for i in range(10):
        maxRunningSteps = 50
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit,
                                            isTerminal, getReward, reset)
        trajectory = sampleTrajectory(policy)

        # plots& plot
        showDemo = True
        if showDemo:
            observe = Observe(trajectory, numAgents)
            fullScreen = False
            screenWidth = 800
            screenHeight = 800
            screen = initializeScreen(fullScreen, screenWidth, screenHeight)
            leaveEdgeSpace = 200
            lineWidth = 3
            # Screen-space boundaries (pixels), reusing the boundary names.
            xBoundary = [leaveEdgeSpace, screenWidth - leaveEdgeSpace * 2]
            yBoundary = [leaveEdgeSpace, screenHeight - leaveEdgeSpace * 2]
            screenColor = THECOLORS['black']
            lineColor = THECOLORS['white']
            drawBackground = DrawBackground(screen, screenColor, xBoundary,
                                            yBoundary, lineColor, lineWidth)
            circleSize = 10
            positionIndex = [0, 1]
            drawState = DrawState(screen, circleSize, positionIndex,
                                  drawBackground)
            numberOfAgents = 2
            # Sheep drawn green, wolf red.
            chasingColors = [THECOLORS['green'], THECOLORS['red']]
            colorSpace = chasingColors[:numberOfAgents]
            FPS = 60
            chaseTrial = ChaseTrialWithTraj(FPS, colorSpace, drawState,
                                            saveImage=True)
            # Map arena coordinates [0, 20] to screen pixels [210, 590].
            rawXRange = [0, 20]
            rawYRange = [0, 20]
            scaledXRange = [210, 590]
            scaledYRange = [210, 590]
            scaleTrajectory = ScaleTrajectory(positionIndex, rawXRange,
                                              rawYRange, scaledXRange,
                                              scaledYRange)
            oldFPS = 5
            adjustFPS = AdjustDfFPStoTraj(oldFPS, FPS)
            getTrajectory = lambda rawTrajectory: scaleTrajectory(
                adjustFPS(rawTrajectory))
            positionList = [observe(index) for index in range(len(trajectory))]
            positionListToDraw = getTrajectory(positionList)

            # Frames are written to <parent-of-cwd>/chasingDemo/Demo.
            currentDir = os.getcwd()
            parentDir = os.path.abspath(os.path.join(currentDir, os.pardir))
            imageFolderName = 'Demo'
            saveImageDir = os.path.join(
                os.path.join(parentDir, 'chasingDemo'), imageFolderName)
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)
            chaseTrial(numberOfAgents, positionListToDraw, saveImageDir)
def testGetAgentPos(self, agentID, state, truePos):
    """Position extracted for `agentID` from `state` must equal `truePos`."""
    extractPos = GetAgentPosFromState(agentID)
    self.assertEqual(extractPos(state), truePos)
def __call__(self, df):
    """Train DDPG for one hyper-parameter cell and return its learning curve.

    The hyper-parameters varianceDiscount, bufferSize and layerWidth are read
    from the index levels of `df` (a slice of the experiment grid). Returns a
    pandas Series mapping time step -> mean reward.

    NOTE(review): relies on free variables from the enclosing scope
    (stateDim, actionDim, actionBound, learningRateActor/Critic, gamma, tau,
    minibatchSize, maxTimeStep, maxEpisode, numAgents) — confirm they are
    defined at module level.
    """
    # Hyper-parameters for this experiment cell come from the index levels.
    varianceDiscount = df.index.get_level_values('varianceDiscount')[0]
    bufferSize = df.index.get_level_values('bufferSize')[0]
    layerWidth = df.index.get_level_values('layerWidth')[0]
    print('buffer: ', bufferSize, ', layers: ', layerWidth,
          ', varDiscount: ', varianceDiscount)

    # Build actor and critic networks with the same hidden-layer widths.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorWriter, actorModel = buildActorModel(layerWidth)
    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticWriter, criticModel = buildCriticModel(layerWidth)

    # Critic trained from (s, a, r, s') with target Q; actor from the
    # critic's action gradients.
    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)
    trainActorFromGradients = TrainActorFromGradients(
        learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    # Soft (tau-blended) target updates every step; target params start
    # equal to the train params.
    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    # Exploration: Gaussian noise whose variance decays exponentially once
    # noiseDecayStartStep steps have elapsed.
    noiseInitVariance = 1
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    # Learning starts as soon as one minibatch can be sampled.
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    # Environment: scripted heat-seeking wolf chases the learning sheep.
    sheepId = 0
    wolfId = 1
    getSheepXPos = GetAgentPosFromState(sheepId)
    getWolfXPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 2
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfXPos, getSheepXPos, wolfSpeed)
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                             stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # Sheep reward: no alive bonus, -20 on capture, plus boundary punishment.
    sheepAliveBonus = 0 / maxTimeStep
    sheepTerminalPenalty = -20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfXPos, getSheepXPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus,
                                        sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                  getIntendedNextState,
                                                  getBoundaryPunishment,
                                                  getSheepXPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    # Assemble the training loop and run it with a bounded replay buffer.
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)
    reset = Reset(xBoundary, yBoundary, numAgents)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    # Package the learning curve as time-step -> mean-reward Series.
    timeStep = list(range(len(meanRewardList)))
    resultSe = pd.Series({time: reward
                          for time, reward in zip(timeStep, meanRewardList)})
    return resultSe
def main():
    """Train a DDPG sheep policy against a heat-seeking wolf, save the
    trained actor/critic checkpoints, and plot the learning curve.

    NOTE(review): relies on free variables from the enclosing scope
    (learningRateActor/Critic, gamma, tau, noiseDecayStartStep,
    minibatchSize, learningStartBufferSize, maxTimeStep, maxEpisode,
    bufferSize) — confirm they are defined at module level.
    """
    # State is (x, y) per agent; actions are 2-D continuous in [-1, 1].
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # Build actor and critic networks.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)
    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    # Critic trained from (s, a, r, s') with target Q; actor from the
    # critic's action gradients.
    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)
    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    # Soft (tau-blended) target updates every step; target params start
    # equal to the train params.
    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    # Exploration: exponentially decaying Gaussian noise on the actor output.
    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    # Environment: scripted heat-seeking wolf chases the learning sheep.
    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                             stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # Sheep reward: alive bonus each step, terminal value on capture, plus
    # boundary punishment heuristic.
    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus,
                                        sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                  getIntendedNextState,
                                                  getBoundaryPunishment,
                                                  getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8]) #all [-1, -1] action
    # reset = lambda: np.array([15, 8, 10, 3]) # all [1. 1.]
    # reset = lambda: np.array([15, 10, 10, 10])
    # Fixed start: sheep at (10, 10), wolf at (15, 5).
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # Save the trained models under parameter-encoded file names.
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'dimension': actionDim,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'gamma': gamma,
        'learningRateActor': learningRateActor,
        'learningRateCritic': learningRateCritic
    }
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)
    # Save each model inside its own graph context.
    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    # Plot mean reward per episode.
    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()