Пример #1
0
 def setUp(self):
     """Store agent ids, position getters and the wolf policy on the test case."""
     # Ids handed to GetAgentPosFromState for each agent.
     self.sheepId, self.wolfId = 0, 1
     self.getSheepXPos = GetAgentPosFromState(self.sheepId)
     self.getWolfXPos = GetAgentPosFromState(self.wolfId)
     self.actionMagnitude = 1
     # The policy receives the wolf getter first, then the sheep getter.
     policyArgs = (self.getWolfXPos, self.getSheepXPos, self.actionMagnitude)
     self.wolfPolicy = HeatSeekingContinuousDeterministicPolicy(*policyArgs)
Пример #2
0
    def testTerminal(self, state, trueTerminal):
        """Assert that IsTerminal (with a boundary check) yields trueTerminal for state."""
        sheepPosGetter = GetAgentPosFromState(0)
        wolfPosGetter = GetAgentPosFromState(1)

        # Boundary check is built from the test case's own x/y bounds.
        boundaryCheck = IsBoundaryTerminal(
            self.xBoundary, self.yBoundary, sheepPosGetter)

        catchRadius = 1
        terminalCheck = IsTerminal(
            wolfPosGetter, sheepPosGetter, catchRadius, boundaryCheck)

        self.assertEqual(terminalCheck(state), trueTerminal)
Пример #3
0
    def setUp(self):
        """Build self.rewardSheep from an alive bonus, a terminal penalty and IsTerminal."""
        sheepPosGetter = GetAgentPosFromState(0)
        wolfPosGetter = GetAgentPosFromState(1)
        catchRadius = 1
        terminalCheck = IsTerminal(wolfPosGetter, sheepPosGetter, catchRadius)

        # Alive bonus is the reciprocal of the step budget (1 / 20).
        stepBudget = 20
        aliveBonus = 1 / stepBudget
        terminalPenalty = -1

        self.rewardSheep = RewardFunctionCompete(
            aliveBonus, terminalPenalty, terminalCheck)
Пример #4
0
 def testBoundaryTerminal(self, state, trueTerminal):
     """Assert that IsBoundaryTerminal, built for agent 0, yields trueTerminal for state."""
     sheepPosGetter = GetAgentPosFromState(0)
     boundaryCheck = IsBoundaryTerminal(
         self.xBoundary, self.yBoundary, sheepPosGetter)
     self.assertEqual(boundaryCheck(state), trueTerminal)
def main():
    """Replay a restored DDPG sheep actor against a heat-seeking wolf and draw it.

    Restores the actor checkpoint from ``<this file's dir>/../trainedDDPGModels``,
    then ten times samples a trajectory (``reset`` always returns the same start
    array) and hands the scaled positions to ``ChaseTrialWithTraj`` together with
    the image directory ``<parent of cwd>/chasingDemo/Demo``.
    """
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    # Only the model is used below; the writer returned alongside it is not.
    _, actorModel = buildActorModel(actorLayerWidths)

    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', 'trainedDDPGModels',
        'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.001_learningRateCritic=0.001_maxEpisode=2000_maxTimeStep=20_minibatchSize=32_wolfSpeed=0.5.ckpt'
    )
    # Alternative checkpoint names (currently unused):
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=400_maxTimeStep=100_minibatchSize=32_wolfSpeed=1.ckpt')
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=1000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt')
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=5000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt')

    restoreVariables(actorModel, actorModelPath)
    # getNoise=None: no noise source is supplied for playback.
    sheepPolicy = ActDDPGOneStep(actionLow,
                                 actionHigh,
                                 actByPolicyTrain,
                                 actorModel,
                                 getNoise=None)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)

    wolfSpeed = 0.5
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    # World-coordinate bounds of the arena (distinct from the screen bounds
    # used for drawing further down).
    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    transit = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    # transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    maxTimeStep = 20
    sheepAliveBonus = 1 / maxTimeStep
    # NOTE(review): positive value; the sign convention depends on
    # RewardFunctionCompete, which is not visible here -- confirm.
    sheepTerminalPenalty = 20
    killzoneRadius = 0
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary,
                                                  yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty,
                                        isTerminal)
    rewardSheepWithBoundaryHeuristics = RewardSheepWithBoundaryHeuristics(
        rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepPos)

    def getSheepAction(actions):
        # Slice the sheep's two action components out of the joint action.
        return [actions[sheepId * actionDim], actions[sheepId * actionDim + 1]]

    def getReward(state, action, nextState):
        return rewardSheepWithBoundaryHeuristics(
            state, getSheepAction(action), nextState)

    def policy(state):
        # Joint action: sheep components followed by wolf components.
        return list(sheepPolicy(state)) + list(wolfPolicy(state))

    # reset = Reset(xBoundary, yBoundary, numAgents)
    def reset():
        return np.array([10, 10, 15, 5])

    for _ in range(10):
        maxRunningSteps = 50
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit,
                                            isTerminal, getReward, reset)
        trajectory = sampleTrajectory(policy)

        # plots& plot
        showDemo = True
        if showDemo:
            observe = Observe(trajectory, numAgents)

            fullScreen = False
            screenWidth = 800
            screenHeight = 800
            screen = initializeScreen(fullScreen, screenWidth, screenHeight)

            leaveEdgeSpace = 200
            lineWidth = 3
            # Screen-pixel bounds get their own names so they never overwrite
            # the world-coordinate xBoundary / yBoundary defined above.
            screenXBoundary = [leaveEdgeSpace, screenWidth - leaveEdgeSpace * 2]
            screenYBoundary = [leaveEdgeSpace, screenHeight - leaveEdgeSpace * 2]
            screenColor = THECOLORS['black']
            lineColor = THECOLORS['white']

            drawBackground = DrawBackground(screen, screenColor,
                                            screenXBoundary, screenYBoundary,
                                            lineColor, lineWidth)
            circleSize = 10
            positionIndex = [0, 1]
            drawState = DrawState(screen, circleSize, positionIndex,
                                  drawBackground)

            chasingColors = [THECOLORS['green'], THECOLORS['red']]
            colorSpace = chasingColors[:numAgents]

            FPS = 60
            chaseTrial = ChaseTrialWithTraj(FPS,
                                            colorSpace,
                                            drawState,
                                            saveImage=True)

            rawXRange = [0, 20]
            rawYRange = [0, 20]
            scaledXRange = [210, 590]
            scaledYRange = [210, 590]
            scaleTrajectory = ScaleTrajectory(positionIndex, rawXRange,
                                              rawYRange, scaledXRange,
                                              scaledYRange)

            oldFPS = 5
            adjustFPS = AdjustDfFPStoTraj(oldFPS, FPS)

            positionList = [observe(index) for index in range(len(trajectory))]
            # Adjust the frame rate first, then rescale to screen coordinates.
            positionListToDraw = scaleTrajectory(adjustFPS(positionList))

            currentDir = os.getcwd()
            parentDir = os.path.abspath(os.path.join(currentDir, os.pardir))
            imageFolderName = 'Demo'
            saveImageDir = os.path.join(parentDir, 'chasingDemo',
                                        imageFolderName)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(saveImageDir, exist_ok=True)

            chaseTrial(numAgents, positionListToDraw, saveImageDir)
Пример #6
0
 def testGetAgentPos(self, agentID, state, truePos):
     """Assert that GetAgentPosFromState(agentID) applied to state equals truePos."""
     extractPos = GetAgentPosFromState(agentID)
     self.assertEqual(extractPos(state), truePos)
Пример #7
0
    def __call__(self, df):
        """Run one DDPG training condition and return its mean rewards as a Series.

        The condition is taken from the first entry of the ``varianceDiscount``,
        ``bufferSize`` and ``layerWidth`` index levels of ``df``. All other
        hyperparameters (``stateDim``, ``gamma``, ``tau``, ``maxEpisode`` ...)
        are read from names defined outside this method.

        Returns:
            pd.Series mapping position in the mean-reward list to its value.
        """
        def firstIndexValue(levelName):
            return df.index.get_level_values(levelName)[0]

        varianceDiscount = firstIndexValue('varianceDiscount')
        bufferSize = firstIndexValue('bufferSize')
        layerWidth = firstIndexValue('layerWidth')
        print('buffer: ', bufferSize, ', layers: ', layerWidth,
              ', varDiscount: ', varianceDiscount)

        # --- actor / critic models -------------------------------------
        makeActor = BuildActorModel(stateDim, actionDim, actionBound)
        actorWriter, actorModel = makeActor(layerWidth)

        makeCritic = BuildCriticModel(stateDim, actionDim)
        criticWriter, criticModel = makeCritic(layerWidth)

        # --- trainers ----------------------------------------------------
        criticStep = TrainCriticBySASRQ(learningRateCritic, gamma,
                                        criticWriter)
        trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                                  criticStep)

        actorGradientStep = TrainActorFromGradients(learningRateActor,
                                                    actorWriter)
        actorStep = TrainActorOneStep(actByPolicyTrain, actorGradientStep,
                                      getActionGradients)
        trainActor = TrainActor(actorStep)

        paramUpdateInterval = 1
        updateParameters = UpdateParameters(paramUpdateInterval, tau)

        # Both models are replaced by what resetTargetParamToTrainParam returns.
        actorModel, criticModel = resetTargetParamToTrainParam(
            [actorModel, criticModel])
        trainModels = TrainDDPGModels(updateParameters, trainActor,
                                      trainCritic, actorModel, criticModel)

        # --- noisy acting ------------------------------------------------
        noiseInitVariance = 1
        # The buffer size doubles as the noise-decay start step.
        noiseDecayStartStep = bufferSize
        getNoise = GetExponentialDecayGaussNoise(
            noiseInitVariance, varianceDiscount, noiseDecayStartStep)
        actWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                      actByPolicyTrain, actorModel, getNoise)

        # --- replay learning ---------------------------------------------
        learningStartBufferSize = minibatchSize
        sampleFromMemory = SampleFromMemory(minibatchSize)
        learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                          sampleFromMemory, trainModels)

        # --- environment -------------------------------------------------
        sheepPosGetter = GetAgentPosFromState(0)
        wolfPosGetter = GetAgentPosFromState(1)

        wolfSpeed = 2
        wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
            wolfPosGetter, sheepPosGetter, wolfSpeed)
        xBoundary = (0, 20)
        yBoundary = (0, 20)
        stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
        physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                                 stayWithinBoundary)
        transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

        # --- reward ------------------------------------------------------
        # Numerator is 0, so the alive bonus evaluates to zero.
        aliveBonus = 0 / maxTimeStep
        terminalPenalty = -20

        killzoneRadius = 1
        isTerminal = IsTerminal(wolfPosGetter, sheepPosGetter, killzoneRadius)
        getBoundaryPunishment = GetBoundaryPunishment(
            xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
        rewardSheep = RewardFunctionCompete(aliveBonus, terminalPenalty,
                                            isTerminal)
        getReward = RewardSheepWithBoundaryHeuristics(
            rewardSheep, getIntendedNextState, getBoundaryPunishment,
            sheepPosGetter)
        sampleOneStep = SampleOneStep(transit, getReward)

        # --- run ---------------------------------------------------------
        runTimeStep = RunTimeStep(actWithNoise, sampleOneStep,
                                  learnFromBuffer)

        reset = Reset(xBoundary, yBoundary, numAgents)
        runEpisode = RunEpisode(reset, runTimeStep, maxTimeStep, isTerminal)

        ddpg = RunAlgorithm(runEpisode, maxEpisode)

        replayBuffer = deque(maxlen=int(bufferSize))
        meanRewards, _ = ddpg(replayBuffer)

        # Key each mean reward by its position in the list.
        return pd.Series(dict(enumerate(meanRewards)))
Пример #8
0
def main():
    """Train a DDPG sheep against a heat-seeking wolf, save both models, plot rewards.

    Hyperparameters such as ``learningRateActor``, ``learningRateCritic``,
    ``gamma``, ``tau``, ``minibatchSize``, ``bufferSize``, ``maxTimeStep``,
    ``maxEpisode``, ``noiseDecayStartStep`` and ``learningStartBufferSize``
    are read from names defined outside this function.
    """
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    # Both models are replaced by what resetTargetParamToTrainParam returns.
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)

    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                             stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 1 / maxTimeStep
    # NOTE(review): positive value; the sign convention depends on
    # RewardFunctionCompete, which is not visible here -- confirm.
    sheepTerminalPenalty = 20

    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary,
                                                  yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty,
                                        isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                  getIntendedNextState,
                                                  getBoundaryPunishment,
                                                  getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8]) #all [-1, -1] action
    # reset = lambda: np.array([15, 8, 10, 3]) # all [1. 1.]
    # reset = lambda: np.array([15, 10, 10, 10])
    # Every episode starts from the same array.
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    # The second returned value (a trajectory) is not used here.
    meanRewardList, _ = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'dimension': actionDim,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'gamma': gamma,
        'learningRateActor': learningRateActor,
        'learningRateCritic': learningRateCritic
    }

    # NOTE: this directory is relative to the current working directory.
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        # Size the x-axis from the data itself: plt.plot raises when x and y
        # lengths differ, which would happen if the reward list does not
        # contain exactly maxEpisode entries.
        plt.plot(list(range(len(meanRewardList))), meanRewardList)
        plt.show()