def testReset(self, xBoundary, yBoundary, numOfAgent):
    """Reset should yield a flat state vector with two coordinates per agent."""
    sampledState = Reset(xBoundary, yBoundary, numOfAgent)()
    self.assertEqual(np.array(sampledState).shape, (numOfAgent * 2, ))
def __call__(self, df):
    """Train one DDPG sheep-escape configuration and return its learning curve.

    The hyperparameters under evaluation (varianceDiscount, bufferSize,
    layerWidth) are read from the MultiIndex of *df*; the returned
    pd.Series maps training time step -> mean episode reward.

    NOTE(review): relies on many module-level globals not visible in this
    chunk (stateDim, actionDim, actionBound, learningRateActor/Critic,
    gamma, tau, minibatchSize, numAgents, maxTimeStep, maxEpisode, plus
    helper callables such as actByPolicyTrain) — confirm against the
    enclosing module.
    """
    # Hyperparameters for this sweep cell come from df's index levels.
    varianceDiscount = df.index.get_level_values('varianceDiscount')[0]
    bufferSize = df.index.get_level_values('bufferSize')[0]
    layerWidth = df.index.get_level_values('layerWidth')[0]
    print('buffer: ', bufferSize, ', layers: ', layerWidth, ', varDiscount: ', varianceDiscount)

    # --- Build actor/critic networks with the swept hidden-layer width ---
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorWriter, actorModel = buildActorModel(layerWidth)
    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticWriter, criticModel = buildCriticModel(layerWidth)

    # --- Assemble the DDPG training pipeline ---
    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)
    trainActorFromGradients = TrainActorFromGradients(
        learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)
    paramUpdateInterval = 1  # sync target nets every step (soft update via tau)
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    # Initialize target-network params to equal the train-network params.
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    # --- Exploration noise: exponential variance decay starts once the
    # replay buffer is full (decay start step tied to bufferSize) ---
    noiseInitVariance = 1
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    # Learning begins as soon as one minibatch worth of transitions exists.
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    # --- Environment: heat-seeking wolf chases the learning sheep ---
    sheepId = 0
    wolfId = 1
    getSheepXPos = GetAgentPosFromState(sheepId)
    getWolfXPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 2
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfXPos, getSheepXPos, wolfSpeed)
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # --- Reward: no per-step alive bonus (0 / maxTimeStep == 0.0), terminal
    # penalty on capture, plus a boundary-proximity punishment heuristic ---
    sheepAliveBonus = 0 / maxTimeStep
    sheepTerminalPenalty = -20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfXPos, getSheepXPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepXPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    # --- Run training and collect the mean-reward curve ---
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)
    reset = Reset(xBoundary, yBoundary, numAgents)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    # Series indexed by time step -> mean reward, for downstream plotting.
    timeStep = list(range(len(meanRewardList)))
    resultSe = pd.Series(
        {time: reward for time, reward in zip(timeStep, meanRewardList)})
    return resultSe
def main():
    """Replay a trained DDPG sheep policy against a heat-seeking wolf and
    render/save demo trajectories.

    Loads actor weights from a fixed checkpoint, samples 20 trajectories
    under the combined sheep+wolf policy, and (when showDemo is True)
    draws each trajectory with pygame and saves frames under
    <parent>/chasingDemo/Demo.
    """
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # Rebuild the actor with the architecture used at training time, then
    # restore the saved checkpoint into it.
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)
    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=1000_maxTimeStep=50_minibatchSize=32_wolfSpeed=3.ckpt')
    restoreVariables(actorModel, actorModelPath)
    # getNoise=None: deterministic (greedy) policy at evaluation time.
    sheepPolicy = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise = None)

    # --- Environment setup ---
    sheepId = 0
    wolfId = 1
    getSheepXPos = GetAgentPosFromState(sheepId)
    getWolfXPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 3
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfXPos, getSheepXPos, wolfSpeed)
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    # FIX: TransitForNoPhysics was called with only stayWithinBoundary here,
    # while every other call site in this file passes
    # (getIntendedNextState, stayWithinBoundary) — made consistent.
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # --- Reward / termination (evaluation-time bookkeeping only) ---
    maxTimeStep = 50
    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = -10
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfXPos, getSheepXPos, killzoneRadius)
    rewardFunc = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)

    # Joint policy: sheep action + wolf action stacked into one action array.
    policy = lambda state: np.array(sheepPolicy(state)) + np.array(wolfPolicy(state))
    reset = Reset(xBoundary, yBoundary, numAgents)

    for i in range(20):
        maxRunningSteps = 50
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, rewardFunc, reset)
        trajectory = sampleTrajectory(policy)

        # plots&
        showDemo = True
        if showDemo:
            observe = Observe(trajectory, numAgents)
            fullScreen = False
            screenWidth = 800
            screenHeight = 800
            screen = initializeScreen(fullScreen, screenWidth, screenHeight)
            leaveEdgeSpace = 200
            lineWidth = 3
            # NOTE: xBoundary/yBoundary are deliberately rebound here to
            # SCREEN pixel coordinates, shadowing the environment bounds.
            xBoundary = [leaveEdgeSpace, screenWidth - leaveEdgeSpace * 2]
            yBoundary = [leaveEdgeSpace, screenHeight - leaveEdgeSpace * 2]
            screenColor = THECOLORS['black']
            lineColor = THECOLORS['white']
            drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth)
            circleSize = 10
            positionIndex = [0, 1]
            drawState = DrawState(screen, circleSize, positionIndex, drawBackground)
            numberOfAgents = 2
            chasingColors = [THECOLORS['green'], THECOLORS['red']]
            colorSpace = chasingColors[: numberOfAgents]
            FPS = 60
            chaseTrial = ChaseTrialWithTraj(FPS, colorSpace, drawState, saveImage=True)

            # Map environment coordinates ([0,20]^2) into the drawable
            # screen region, and resample the trajectory from 5 to 60 FPS.
            rawXRange = [0, 20]
            rawYRange = [0, 20]
            scaledXRange = [210, 590]
            scaledYRange = [210, 590]
            scaleTrajectory = ScaleTrajectory(positionIndex, rawXRange, rawYRange, scaledXRange, scaledYRange)
            oldFPS = 5
            adjustFPS = AdjustDfFPStoTraj(oldFPS, FPS)
            getTrajectory = lambda rawTrajectory: scaleTrajectory(adjustFPS(rawTrajectory))
            positionList = [observe(index) for index in range(len(trajectory))]
            positionListToDraw = getTrajectory(positionList)

            # Frames are saved under <parent-of-cwd>/chasingDemo/Demo.
            currentDir = os.getcwd()
            parentDir = os.path.abspath(os.path.join(currentDir, os.pardir))
            imageFolderName = 'Demo'
            saveImageDir = os.path.join(os.path.join(parentDir, 'chasingDemo'), imageFolderName)
            # exist_ok avoids the check-then-create race of the old
            # os.path.exists() guard.
            os.makedirs(saveImageDir, exist_ok=True)
            chaseTrial(numberOfAgents, positionListToDraw, saveImageDir)
def main():
    """Train a DDPG sheep on a small bounded grid with a slow wolf, then
    save the trained actor/critic checkpoints and plot the reward curve.

    NOTE(review): depends on module-level globals not visible in this chunk
    (learningRateActor/Critic, gamma, tau, noiseDecayStartStep,
    minibatchSize, learningStartBufferSize, maxTimeStep, maxEpisode,
    bufferSize, plus helper callables) — confirm against the enclosing
    module.
    """
    numAgents = 2
    stateDim = numAgents * 2  # 2D position per agent, flattened
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    # --- Build actor and critic networks ---
    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)
    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    # --- Assemble the DDPG training pipeline ---
    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)
    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)
    paramUpdateInterval = 1  # sync target nets every step (soft update via tau)
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    # Initialize target-network params to equal the train-network params.
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    # --- Exploration noise (noiseDecayStartStep comes from module scope) ---
    noiseInitVariance = 1
    varianceDiscount = .99995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    # --- Environment: slow wolf on a small 5x5 grid ---
    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 0.1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    xBoundary = (0, 5)
    yBoundary = (0, 5)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    # --- Reward: episode also terminates when the sheep hits the boundary;
    # an action-magnitude cost is subtracted from the sheep reward.
    # NOTE(review): aliveBonus=1 and terminalPenalty=+100 differ in sign
    # convention from the sibling scripts in this file (-20/-10) — confirm
    # RewardFunctionCompete's expected sign here.
    sheepAliveBonus = 1
    sheepTerminalPenalty = 100
    killzoneRadius = 0.1
    isBoundaryTerminal = IsBoundaryTerminal(xBoundary, yBoundary, getSheepPos)
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius, isBoundaryTerminal)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    actionCostRate = 0.5
    getActionCost = GetActionCost(actionCostRate)
    getReward = RewardWithActionCost(rewardSheep, getActionCost)
    sampleOneStep = SampleOneStep(transit, getReward)

    # --- Run training ---
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)
    # resetSheepOnly = Reset(xBoundary, yBoundary, numOfAgent = 1)
    # reset = lambda: list(resetSheepOnly()) +[1, 1]
    reset = Reset(xBoundary, yBoundary, numAgents)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # --- Persist checkpoints; file names encode the run's hyperparameters ---
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'learningRate': learningRateActor,
        'noiseInitVar': noiseInitVariance,
        'decay': varianceDiscount,
        'gridSize': xBoundary[1]
    }
    modelSaveDirectory = "../../trainedDDPGModels/wolfAvoidBoundaryActionCost/resetBoth/smallBoundary"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)
    # NOTE(review): as_default() suggests actorModel/criticModel behave as
    # TF graphs here and the trained models are saved under them — confirm
    # the graph/session handling matches saveVariables' expectations.
    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()