def main():
    # Hyperparameters referenced below (learningRateActor, learningRateCritic, gamma, tau,
    # noiseDecayStartStep, minibatchSize, learningStartBufferSize, maxTimeStep, maxEpisode,
    # bufferSize) are assumed to be defined elsewhere in this script.
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # build actor and critic networks
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    # DDPG training components
    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    # exploration noise and replay-buffer learning
    noiseInitVariance = 1
    varianceDiscount = .99995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    # environment: heat-seeking wolf chasing a learning sheep inside a bounded grid
    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)

    wolfSpeed = 0.1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfPos, getSheepPos, wolfSpeed)

    xBoundary = (0, 5)
    yBoundary = (0, 5)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # reward: alive bonus, terminal penalty on capture or boundary exit, minus an action cost
    sheepAliveBonus = 1
    sheepTerminalPenalty = 100
    killzoneRadius = 0.1
    isBoundaryTerminal = IsBoundaryTerminal(xBoundary, yBoundary, getSheepPos)
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius, isBoundaryTerminal)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)

    actionCostRate = 0.5
    getActionCost = GetActionCost(actionCostRate)
    getReward = RewardWithActionCost(rewardSheep, getActionCost)

    sampleOneStep = SampleOneStep(transit, getReward)
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)

    # resetSheepOnly = Reset(xBoundary, yBoundary, numOfAgent=1)
    # reset = lambda: list(resetSheepOnly()) + [1, 1]
    reset = Reset(xBoundary, yBoundary, numAgents)

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save trained models
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'learningRate': learningRateActor,
        'noiseInitVar': noiseInitVariance,
        'decay': varianceDiscount,
        'gridSize': xBoundary[1]
    }

    modelSaveDirectory = "../../trainedDDPGModels/wolfAvoidBoundaryActionCost/resetBoth/smallBoundary"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
def testActionCost(self, action, trueCost):
    getActionCost = GetActionCost(self.actionCostRate)
    cost = getActionCost(action)
    self.assertEqual(cost, trueCost)
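# For reference, a minimal sketch of the GetActionCost interface assumed by this
# test and by the scripts above: a callable built from a cost rate and applied to
# one agent's action vector. The quadratic cost below is an illustrative
# assumption only, not necessarily what the project's GetActionCost implements.
#
#     class GetActionCost:
#         def __init__(self, actionCostRate):
#             self.actionCostRate = actionCostRate
#
#         def __call__(self, action):
#             # assumed form: cost grows with the squared magnitude of the action
#             return self.actionCostRate * sum(a ** 2 for a in action)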
def main():
    # maxRunningSteps is assumed to be defined elsewhere in this script.
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # rebuild the actor network and restore the trained sheep policy
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', '..', 'trainedDDPGModels', 'noWolfAvoidBoundary',
        'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.001_learningRateCritic=0.001_maxEpisode=1000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.ckpt'
    )
    restoreVariables(actorModel, actorModelPath)
    sheepPolicy = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise=None)

    # environment: stationary wolf, sheep acts with the restored policy
    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    transit = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)

    sheepAliveBonus = 1
    sheepTerminalPenalty = 20
    killzoneRadius = 0
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)

    actionCostRate = 1
    getActionCost = GetActionCost(actionCostRate)
    rewardWithActionCost = RewardWithActionCost(rewardSheep, getActionCost)
    getSheepAction = lambda actions: [actions[sheepId * actionDim], actions[sheepId * actionDim + 1]]
    getReward = lambda state, action, nextState: rewardWithActionCost(state, getSheepAction(action), nextState)

    policy = lambda state: list(sheepPolicy(state)) + list(wolfPolicy(state))

    xBoundary = [0, 0.5]
    yBoundary = [0, 0.5]
    reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([19.9, 10, 15, 5])

    for i in range(10):
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, getReward, reset)
        trajectory = sampleTrajectory(policy)

        # plot and render the sampled trajectory
        showDemo = True
        if showDemo:
            observe = Observe(trajectory, numAgents)

            fullScreen = False
            screenWidth = 800
            screenHeight = 800
            screen = initializeScreen(fullScreen, screenWidth, screenHeight)

            leaveEdgeSpace = 200
            lineWidth = 3
            xBoundary = [leaveEdgeSpace, screenWidth - leaveEdgeSpace * 2]
            yBoundary = [leaveEdgeSpace, screenHeight - leaveEdgeSpace * 2]
            screenColor = THECOLORS['black']
            lineColor = THECOLORS['white']
            drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth)
            circleSize = 10
            positionIndex = [0, 1]
            drawState = DrawState(screen, circleSize, positionIndex, drawBackground)

            numberOfAgents = 2
            chasingColors = [THECOLORS['green'], THECOLORS['red']]
            colorSpace = chasingColors[:numberOfAgents]

            FPS = 60
            chaseTrial = ChaseTrialWithTraj(FPS, colorSpace, drawState, saveImage=True)

            # scale trajectory coordinates from the environment range to screen pixels
            rawXRange = [0, 20]
            rawYRange = [0, 20]
            scaledXRange = [210, 590]
            scaledYRange = [210, 590]
            scaleTrajectory = ScaleTrajectory(positionIndex, rawXRange, rawYRange, scaledXRange, scaledYRange)

            oldFPS = 5
            adjustFPS = AdjustDfFPStoTraj(oldFPS, FPS)

            getTrajectory = lambda rawTrajectory: scaleTrajectory(adjustFPS(rawTrajectory))
            positionList = [observe(index) for index in range(len(trajectory))]
            positionListToDraw = getTrajectory(positionList)

            currentDir = os.getcwd()
            parentDir = os.path.abspath(os.path.join(currentDir, os.pardir))
            imageFolderName = 'Demo'
            saveImageDir = os.path.join(os.path.join(parentDir, 'chasingDemo'), imageFolderName)
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)

            chaseTrial(numberOfAgents, positionListToDraw, saveImageDir)