def setUp(self):
    self.sheepId = 0
    self.actionIndex = 1
    self.decay = 1
    self.actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                        (-7, -7), (0, -10), (7, -7)]
    self.actionToOneHot = lambda action: np.asarray([
        1 if (np.array(action) == np.array(self.actionSpace[index])).all()
        else 0 for index in range(len(self.actionSpace))
    ])
    self.rewardFunction = lambda s, a: 1
    self.anotherRewardFunction = lambda s, a: -1
    self.accumulateRewards = AccumulateRewards(self.decay,
                                               self.rewardFunction)
    self.accumulateMultipleAgentRewards = AccumulateMultiAgentRewards(
        self.decay, [self.rewardFunction, self.anotherRewardFunction])
    self.addValuesToTrajectory = AddValuesToTrajectory(
        self.accumulateRewards)
    self.getTerminalActionFromTrajectory = lambda trajectory: trajectory[
        -1][1]
    self.removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        self.getTerminalActionFromTrajectory)
    self.processTrajectoryForPolicyValueNet = ProcessTrajectoryForPolicyValueNet(
        self.actionToOneHot, self.sheepId)
    self.compareTuples = lambda tuple1, tuple2: all(
        np.array_equal(element1, element2) for element1, element2 in zip(
            tuple1, tuple2)) and len(tuple1) == len(tuple2)
    self.compareTrajectories = lambda traj1, traj2: all(
        self.compareTuples(tuple1, tuple2) for tuple1, tuple2 in zip(
            traj1, traj2)) and len(traj1) == len(traj2)

def testAddValuesToTraj(self, traj, decay, groundTruthTrajWithValues):
    self.accumulateRewards = AccumulateRewards(decay, self.rewardFunction)
    self.addValuesToTrajectory = AddValuesToTrajectory(
        self.accumulateRewards)
    trajWithValues = self.addValuesToTrajectory(traj)
    for transition, groundTruthTransition in zip(
            trajWithValues, groundTruthTrajWithValues):
        self.assertEqual(transition[0:4], groundTruthTransition[0:4])

def testAddMultiAgentValuesToTraj(self, traj, decay,
                                  groundTruthTrajWithValues):
    accRewards = AccumulateMultiAgentRewards(
        decay, [self.rewardFunction, self.anotherRewardFunction])
    self.addValuesToTrajectory = AddValuesToTrajectory(accRewards)
    trajWithValues = self.addValuesToTrajectory(traj)
    for transition, groundTruthTransition in zip(
            trajWithValues, groundTruthTrajWithValues):
        self.assertEqual(transition[0:3], groundTruthTransition[0:3])
        self.assertTrue(np.all(transition[3] == groundTruthTransition[3]))
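The tests above exercise AccumulateRewards and AccumulateMultiAgentRewards, which turn a trajectory into per-step values. Below is a minimal sketch of the discounted-return accumulation they are assumed to perform; accumulateRewardsSketch is a hypothetical helper for illustration, not the class under test.

import numpy as np

def accumulateRewardsSketch(decay, rewardFunction, trajectory):
    # assumed behaviour: value[t] = sum_k decay**k * reward(state[t+k], action[t+k])
    rewards = [rewardFunction(state, action) for state, action, *rest in trajectory]
    values = np.zeros(len(rewards))
    accumulatedReward = 0
    # walk backwards so value[t] = reward[t] + decay * value[t + 1]
    for t in reversed(range(len(rewards))):
        accumulatedReward = rewards[t] + decay * accumulatedReward
        values[t] = accumulatedReward
    return values

# Example: with decay = 1 and a constant reward of 1 per step, a 3-step
# trajectory is assigned values [3, 2, 1].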
Example #4
def main():
    # manipulated variables
    manipulatedVariables = OrderedDict()
    manipulatedVariables['dataSize'] = [1000, 2000, 3000]
    manipulatedVariables['depth'] = [5, 9]
    manipulatedVariables['trainSteps'] = list(range(0, 50001, 10000))

    levelNames = list(manipulatedVariables.keys())
    levelValues = list(manipulatedVariables.values())
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)

    killzoneRadius = 25
    maxRunningSteps = 100
    numSimulations = 200
    # accumulate rewards for trajectories
    numOfAgent = 3
    sheepId = 0
    wolvesId = 1

    wolfOneId = 1
    wolfTwoId = 2
    xPosIndex = [0, 1]
    xBoundary = [0, 600]
    yBoundary = [0, 600]

    getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
    getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
    getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)

    isTerminalOne = IsTerminal(getWolfOneXPos, getSheepXPos, killzoneRadius)
    isTerminalTwo = IsTerminal(getWolfTwoXPos, getSheepXPos, killzoneRadius)
    playIsTerminal = lambda state: isTerminalOne(state) or isTerminalTwo(state)

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    transit = TransiteForNoPhysics(stayInBoundaryByReflectVelocity)

    playAliveBonus = -1 / maxRunningSteps
    playDeathPenalty = 1
    playKillzoneRadius = killzoneRadius
    playReward = RewardFunctionCompete(playAliveBonus, playDeathPenalty,
                                       playIsTerminal)

    decay = 1
    accumulateRewards = AccumulateRewards(decay, playReward)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)

    # generate trajectory parallel
    generateTrajectoriesCodeName = 'generateCenterControlTrajectoryByCondition.py'
    evalNumTrials = 500
    numCpuCores = os.cpu_count()
    numCpuToUse = int(0.75 * numCpuCores)
    numCmdList = min(evalNumTrials, numCpuToUse)
    generateTrajectoriesParallel = GenerateTrajectoriesParallel(
        generateTrajectoriesCodeName, evalNumTrials, numCmdList)

    # run all trials and save trajectories
    generateTrajectoriesParallelFromDf = lambda df: generateTrajectoriesParallel(
        readParametersFromDf(df))
    # toSplitFrame.groupby(levelNames).apply(generateTrajectoriesParallelFromDf)

    # save evaluation trajectories
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', 'data', 'evaluateSupervisedLearning',
        'multiMCTSAgentResNetNoPhysicsCenterControl',
        'evaluateCenterControlTrajByCondition')

    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    trajectoryExtension = '.pickle'

    trajectoryFixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }

    getTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                        trajectoryExtension,
                                        trajectoryFixedParameters)
    getTrajectorySavePathFromDf = lambda df: getTrajectorySavePath(
        readParametersFromDf(df))

    # compute statistics on the trajectories
    fuzzySearchParameterNames = []
    loadTrajectories = LoadTrajectories(getTrajectorySavePath, loadFromPickle,
                                        fuzzySearchParameterNames)
    loadTrajectoriesFromDf = lambda df: loadTrajectories(
        readParametersFromDf(df))
    measurementFunction = lambda trajectory: accumulateRewards(trajectory)[0]
    computeStatistics = ComputeStatistics(loadTrajectoriesFromDf,
                                          measurementFunction)
    statisticsDf = toSplitFrame.groupby(levelNames).apply(computeStatistics)

    def calculateSurviveRatio(trajectory):
        # indicator vector for which survival-time bin this trajectory falls into
        length = len(trajectory)
        count = np.array([length < 50, 50 <= length < 100, length >= 100])
        return count

    computeNumbers = ComputeStatistics(loadTrajectoriesFromDf,
                                       calculateSurviveRatio)
    df = toSplitFrame.groupby(levelNames).apply(computeNumbers)
    print(df)

    fig = plt.figure()
    numRows = 1
    numColumns = 1
    plotCounter = 1
    axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
    xlabel = ['0-50', '50-100', '>=100']
    x = np.arange(len(xlabel))

    numTrials = 500
    yMean = df['mean'].tolist()
    # standard error of the mean across trials
    yErr = np.array(df['std'].tolist()) / np.sqrt(numTrials - 1)

    totalWidth, n = 0.6, 3
    width = totalWidth / n

    x = x - (totalWidth - width) / 2
    plt.bar(x, yMean[0], yerr=yErr[0], width=width, label='trainStep0')
    plt.bar(x + width,
            yMean[1],
            yerr=yErr[1],
            width=width,
            label='trainStep10000')
    plt.bar(x + width * 2,
            yMean[2],
            yerr=yErr[2],
            width=width,
            label='trainStep30000')
    plt.bar(x + width * 3,
            yMean[3],
            yerr=yErr[3],
            width=width,
            label='trainStep50000')
    plt.suptitle('dataSize 3000')
    plt.xticks(x, xlabel)
    plt.ylim(0, 1)
    plt.xlabel('living steps')
    plt.legend(loc='best')
    # plt.show()

    # plot the results
    fig = plt.figure()
    numRows = len(manipulatedVariables['depth'])
    numColumns = len(manipulatedVariables['dataSize'])
    plotCounter = 1
    print(statisticsDf)
    for depth, grp in statisticsDf.groupby('depth'):
        grp.index = grp.index.droplevel('depth')

        for dataSize, group in grp.groupby('dataSize'):
            group.index = group.index.droplevel('dataSize')

            axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
            if plotCounter % numColumns == 1:
                axForDraw.set_ylabel('depth: {}'.format(depth))
            if plotCounter <= numColumns:
                axForDraw.set_title('dataSize: {}'.format(dataSize))

            axForDraw.set_ylim(-1, 1)
            # plt.ylabel('Accumulated rewards')
            maxTrainSteps = manipulatedVariables['trainSteps'][-1]

            plt.plot([0, maxTrainSteps], [0.354] * 2,
                     '--m',
                     color="#1C2833",
                     label='pure MCTS')

            group.plot(ax=axForDraw,
                       y='mean',
                       yerr='std',
                       marker='o',
                       logx=False)

            plotCounter += 1

    plt.suptitle('center control wolves')
    plt.legend(loc='best')
    plt.show()
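Both ComputeStatistics instances above map every loaded trajectory of a condition to a measurement (accumulated reward or survival-time bin indicators) and aggregate across trajectories. A minimal sketch of that aggregation, assuming it reports the per-condition mean and standard deviation; computeStatisticsSketch is a hypothetical stand-in.

import numpy as np
import pandas as pd

def computeStatisticsSketch(trajectories, measurementFunction):
    # one measurement per trajectory; it may be a scalar (accumulated reward)
    # or a vector (survival-time bin indicators)
    measurements = np.array([measurementFunction(trajectory)
                             for trajectory in trajectories])
    return pd.Series({'mean': measurements.mean(axis=0),
                      'std': measurements.std(axis=0)})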
Example #5
def trainOneCondition(manipulatedVariables):
    depth = int(manipulatedVariables['depth'])
    # Get dataset for training
    dirName = os.path.dirname(__file__)
    dataSetDirectory = os.path.join(dirName, '..', '..', '..', '..', 'data',
                                    'NoPhysics2wolves1sheep',
                                    'trainWolvesTwoCenterControlAction88',
                                    'trajectories')

    if not os.path.exists(dataSetDirectory):
        os.makedirs(dataSetDirectory)

    dataSetExtension = '.pickle'
    dataSetMaxRunningSteps = 50
    dataSetNumSimulations = 250
    killzoneRadius = 150
    agentId = 1
    wolvesId = 1
    dataSetFixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': dataSetMaxRunningSteps,
        'numSimulations': dataSetNumSimulations,
        'killzoneRadius': killzoneRadius
    }

    getDataSetSavePath = GetSavePath(dataSetDirectory, dataSetExtension,
                                     dataSetFixedParameters)
    print("DATASET LOADED!")

    numOfAgent = 3
    # accumulate rewards for trajectories
    decay = 1
    accumulateRewards = AccumulateRewards(decay)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)

    # pre-process the trajectories
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 10
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))

    predatorPowerRatio = 8
    wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7)]
    wolfActionOneSpace = list(
        map(tuple,
            np.array(wolfActionSpace) * predatorPowerRatio))
    wolfActionTwoSpace = list(
        map(tuple,
            np.array(wolfActionSpace) * predatorPowerRatio))
    wolvesActionSpace = list(it.product(wolfActionOneSpace,
                                        wolfActionTwoSpace))

    numActionSpace = len(wolvesActionSpace)

    actionIndex = 1
    actionToOneHot = ActionToOneHot(wolvesActionSpace)
    getTerminalActionFromTrajectory = lambda trajectory: trajectory[-1][
        actionIndex]
    removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        getTerminalActionFromTrajectory)
    processTrajectoryForNN = ProcessTrajectoryForPolicyValueNet(
        actionToOneHot, wolvesId)

    preProcessTrajectories = PreProcessTrajectories(
        addValuesToTrajectory, removeTerminalTupleFromTrajectory,
        processTrajectoryForNN)

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectories = LoadTrajectories(getDataSetSavePath, loadFromPickle,
                                        fuzzySearchParameterNames)
    loadedTrajectories = loadTrajectories(parameters={})
    print(loadedTrajectories[0])

    filterState = lambda timeStep: (timeStep[0][:numOfAgent], timeStep[1],
                                    timeStep[2], timeStep[3])
    trajectories = [[filterState(timeStep) for timeStep in trajectory]
                    for trajectory in loadedTrajectories]
    print(len(trajectories))

    preProcessedTrajectories = np.concatenate(
        preProcessTrajectories(trajectories))

    trainData = [list(varBatch) for varBatch in zip(*preProcessedTrajectories)]
    valuedTrajectories = [addValuesToTrajectory(tra) for tra in trajectories]

    # neural network init and save path
    numStateSpace = 6
    regularizationFactor = 1e-4
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]

    generateModel = GenerateModel(numStateSpace, numActionSpace,
                                  regularizationFactor)

    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    sheepNNModel = generateModel(sharedWidths * depth, actionLayerWidths,
                                 valueLayerWidths, resBlockSize,
                                 initializationMethod, dropoutRate)

    initTimeStep = 0
    valueIndex = 3
    trainDataMeanAccumulatedReward = np.mean(
        [tra[initTimeStep][valueIndex] for tra in valuedTrajectories])
    print(trainDataMeanAccumulatedReward)

    # function to train NN model
    terminalThreshold = 1e-10
    lossHistorySize = 10
    initActionCoeff = 1
    initValueCoeff = 1
    initCoeff = (initActionCoeff, initValueCoeff)
    afterActionCoeff = 1
    afterValueCoeff = 1
    afterCoeff = (afterActionCoeff, afterValueCoeff)
    terminalController = lambda evalDict, numSteps: False
    coefficientController = CoefficientCotroller(initCoeff, afterCoeff)
    reportInterval = 10000
    trainStepsIntervel = 10000
    trainReporter = TrainReporter(trainStepsIntervel, reportInterval)
    learningRateDecay = 1
    learningRateDecayStep = 1
    learningRateModifier = lambda learningRate: LearningRateModifier(
        learningRate, learningRateDecay, learningRateDecayStep)
    getTrainNN = lambda batchSize, learningRate: Train(
        trainStepsIntervel, batchSize, sampleData,
        learningRateModifier(learningRate), terminalController,
        coefficientController, trainReporter)

    # get path to save trained models
    NNModelFixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': dataSetMaxRunningSteps,
        'numSimulations': dataSetNumSimulations
    }

    NNModelSaveDirectory = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'NoPhysics2wolves1sheep',
                                        'trainWolvesTwoCenterControlAction88',
                                        'trainedResNNModels')
    if not os.path.exists(NNModelSaveDirectory):
        os.makedirs(NNModelSaveDirectory)
    NNModelSaveExtension = ''
    getNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                     NNModelSaveExtension,
                                     NNModelFixedParameters)

    # function to train models
    numOfTrainStepsIntervel = 6
    trainIntervelIndexes = list(range(numOfTrainStepsIntervel))
    trainModelForConditions = TrainModelForConditions(trainIntervelIndexes,
                                                      trainStepsIntervel,
                                                      trainData, sheepNNModel,
                                                      getTrainNN,
                                                      getNNModelSavePath)
    trainModelForConditions(manipulatedVariables)
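trainOneCondition encodes each joint wolves action, built with it.product over the two single-wolf action spaces, as a one-hot policy target via ActionToOneHot. Below is a minimal sketch consistent with the actionToOneHot lambda in the test code above; actionToOneHotSketch is a hypothetical helper, not the repo's class.

import numpy as np

def actionToOneHotSketch(actionSpace, action):
    # 1 at the index of the matching (joint) action, 0 elsewhere
    return np.asarray([1 if np.array_equal(action, actionSpace[index]) else 0
                       for index in range(len(actionSpace))])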
Example #6
def main():
    # important parameters

    # manipulated variables

    manipulatedVariables = OrderedDict()

    manipulatedVariables['miniBatchSize'] = [64, 256]
    manipulatedVariables['learningRate'] = [1e-3, 1e-4, 1e-5]
    manipulatedVariables['depth'] = [5, 9, 17]  #[4,8,16]#
    manipulatedVariables['trainSteps'] = [0, 5000, 10000, 20000, 50000]
    levelNames = list(manipulatedVariables.keys())
    levelValues = list(manipulatedVariables.values())
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)

    # accumulate rewards for trajectories
    sheepId = 0
    wolfId = 1

    xPosIndex = [0, 1]
    getSheepPos = GetAgentPosFromState(sheepId, xPosIndex)
    getWolfPos = GetAgentPosFromState(wolfId, xPosIndex)

    killzoneRadius = 2
    numSimulations = 150
    maxRunningSteps = 30
    agentId = 1

    playAliveBonus = -1 / maxRunningSteps
    playDeathPenalty = 1
    playKillzoneRadius = killzoneRadius
    playIsTerminal = IsTerminal(playKillzoneRadius, getSheepPos, getWolfPos)
    playReward = RewardFunctionCompete(playAliveBonus, playDeathPenalty,
                                       playIsTerminal)

    decay = 1
    accumulateRewards = AccumulateRewards(decay, playReward)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)

    # generate trajectory parallel
    # generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryFixObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfNNEvaluationTrajectoryFixObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryMovedObstacle.py'
    generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryRandomObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfNNEvaluationTrajectoryRandomObstacle.py'
    evalNumTrials = 100
    numCpuCores = os.cpu_count()
    numCpuToUse = int(0.75 * numCpuCores)
    numCmdList = min(evalNumTrials, numCpuToUse)
    generateTrajectoriesParallel = GenerateTrajectoriesParallel(
        generateTrajectoriesCodeName, evalNumTrials, numCmdList)

    # run all trials and save trajectories
    generateTrajectoriesParallelFromDf = lambda df: generateTrajectoriesParallel(
        readParametersFromDf(df))
    toSplitFrame.groupby(levelNames).apply(generateTrajectoriesParallelFromDf)

    # save evaluation trajectories
    dirName = os.path.dirname(__file__)
    dataFolderName = os.path.join(dirName, '..', '..', '..', 'data',
                                  'multiAgentTrain', 'MCTSRandomObstacle')
    trajectoryDirectory = os.path.join(
        dataFolderName, 'evaluationTrajectoriesResNNWithObstacle')

    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)
    trajectoryExtension = '.pickle'

    trajectoryFixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius,
        'agentId': agentId
    }

    getTrajectorySavePath = GetSavePath(trajectoryDirectory,
                                        trajectoryExtension,
                                        trajectoryFixedParameters)
    getTrajectorySavePathFromDf = lambda df: getTrajectorySavePath(
        readParametersFromDf(df))

    # compute statistics on the trajectories

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectories = LoadTrajectories(getTrajectorySavePath, loadFromPickle,
                                        fuzzySearchParameterNames)
    loadTrajectoriesFromDf = lambda df: loadTrajectories(
        readParametersFromDf(df))
    measurementFunction = lambda trajectory: accumulateRewards(trajectory)[0]

    computeStatistics = ComputeStatistics(loadTrajectoriesFromDf,
                                          measurementFunction)
    statisticsDf = toSplitFrame.groupby(levelNames).apply(computeStatistics)
    print(statisticsDf)

    # manipulatedVariables['miniBatchSize'] = [64, 128]
    # manipulatedVariables['learningRate'] =  [ 1e-3,1e-4,1e-5]
    # manipulatedVariables['depth'] = [4,8,16]
    # manipulatedVariables['trainSteps']=[0,20000,40000,60000,100000,180000]

    # plot the results
    fig = plt.figure()
    numRows = len(manipulatedVariables['depth'])
    numColumns = len(manipulatedVariables['learningRate'])
    plotCounter = 1
    selfId = 0
    for depth, grp in statisticsDf.groupby('depth'):
        grp.index = grp.index.droplevel('depth')

        for learningRate, group in grp.groupby('learningRate'):
            group.index = group.index.droplevel('learningRate')

            axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
            if (plotCounter % numColumns == 1) or numColumns == 1:
                axForDraw.set_ylabel('depth: {}'.format(depth))
            if plotCounter <= numColumns:
                axForDraw.set_title('learningRate: {}'.format(learningRate))

            axForDraw.set_ylim(-1, 1)
            drawPerformanceLine(group, axForDraw, selfId)
            plotCounter += 1

    plt.suptitle('SupervisedNNWolfwithRandomWallState')
    plt.legend(loc='best')
    plt.show()
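Each groupby cell above is converted back into a parameter dictionary by readParametersFromDf before being passed to the trajectory generator, save-path builder, and loader. A minimal sketch of what that conversion is assumed to do with the MultiIndex of a grouped sub-frame (readParametersFromDfSketch is hypothetical):

def readParametersFromDfSketch(df):
    # a grouped cell carries a single combination of level values in its index
    return {levelName: df.index.get_level_values(levelName)[0]
            for levelName in df.index.names}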
Example #7
def iterateTrainOneCondition(parameterOneCondition):

    numTrainStepEachIteration = int(
        parameterOneCondition['numTrainStepEachIteration'])
    numTrajectoriesPerIteration = int(
        parameterOneCondition['numTrajectoriesPerIteration'])
    dirName = os.path.dirname(__file__)

    numOfAgent = 2
    agentIds = list(range(numOfAgent))

    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 50
    fixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }
    # env MDP
    sheepsID = [0]
    wolvesID = [1, 2]
    blocksID = []

    numSheeps = len(sheepsID)
    numWolves = len(wolvesID)
    numBlocks = len(blocksID)

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks

    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2

    sheepMaxSpeed = 1.3 * 1
    wolfMaxSpeed = 1.0 * 1
    blockMaxSpeed = None

    entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [
        wolfMaxSpeed
    ] * numWolves + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    centralControlId = 1
    centerControlIndexList = [centralControlId]
    reshapeAction = UnpackCenterControlAction(centerControlIndexList)
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction,
                                                applyActionForce,
                                                applyEnvironForce,
                                                integrateState)

    numFramesToInterpolate = 1

    def transit(state, action):
        for frameIndex in range(numFramesToInterpolate):
            nextState = interpolateState(state, action)
            action = np.array([(0, 0)] * numAgents)
            state = nextState
        return nextState

    isTerminal = lambda state: False

    isCollision = IsCollision(getPosFromAgentState)
    collisionRewardWolf = 1
    punishForOutOfBound = PunishForOutOfBound()
    rewardWolf = RewardCentralControlPunishBond(
        wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisionRewardWolf)
    collisionRewardSheep = -1
    rewardSheep = RewardCentralControlPunishBond(
        sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisionRewardSheep)

    resetState = ResetMultiAgentChasing(numAgents, numBlocks)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # policy
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]

    preyPowerRatio = 0.5
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))

    predatorPowerRatio = 0.5
    wolfActionOneSpace = list(
        map(tuple,
            np.array(wolfActionSpace) * predatorPowerRatio))
    wolfActionTwoSpace = list(
        map(tuple,
            np.array(wolfActionSpace) * predatorPowerRatio))

    wolvesActionSpace = list(it.product(wolfActionOneSpace,
                                        wolfActionTwoSpace))

    actionSpaceList = [sheepActionSpace, wolvesActionSpace]

    # neural network init
    numStateSpace = 4 * numEntities
    numSheepActionSpace = len(sheepActionSpace)
    numWolvesActionSpace = len(wolvesActionSpace)

    regularizationFactor = 1e-4
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]

    generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                       regularizationFactor)
    generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace,
                                        regularizationFactor)
    generateModelList = [generateSheepModel, generateWolvesModel]

    sheepDepth = 9
    wolfDepth = 9
    depthList = [sheepDepth, wolfDepth]
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    multiAgentNNmodel = [
        generateModel(sharedWidths * depth, actionLayerWidths,
                      valueLayerWidths, resBlockSize, initializationMethod,
                      dropoutRate)
        for depth, generateModel in zip(depthList, generateModelList)
    ]

    # replay buffer
    bufferSize = 20000
    saveToBuffer = SaveToBuffer(bufferSize)

    def getUniformSamplingProbabilities(buffer):
        return [(1 / len(buffer)) for _ in buffer]

    miniBatchSize = 512
    sampleBatchFromBuffer = SampleBatchFromBuffer(
        miniBatchSize, getUniformSamplingProbabilities)

    # pre-process the trajectory for replayBuffer
    rewardMultiAgents = [rewardSheep, rewardWolf]
    decay = 1
    accumulateMultiAgentRewards = AccumulateMultiAgentRewards(
        decay, rewardMultiAgents)

    addMultiAgentValuesToTrajectory = AddValuesToTrajectory(
        accumulateMultiAgentRewards)
    actionIndex = 1

    def getTerminalActionFromTrajectory(trajectory):
        return trajectory[-1][actionIndex]

    removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        getTerminalActionFromTrajectory)

    # pre-process the trajectory for NNTraining
    sheepActionToOneHot = ActionToOneHot(sheepActionSpace)
    wolvesActionToOneHot = ActionToOneHot(wolvesActionSpace)
    actionToOneHotList = [sheepActionToOneHot, wolvesActionToOneHot]
    processTrajectoryForPolicyValueNets = [
        ProcessTrajectoryForPolicyValueNetMultiAgentReward(
            actionToOneHotList[agentId], agentId) for agentId in agentIds
    ]

    # function to train NN model
    terminalThreshold = 1e-6
    lossHistorySize = 10
    initActionCoeff = 1
    initValueCoeff = 1
    initCoeff = (initActionCoeff, initValueCoeff)
    afterActionCoeff = 1
    afterValueCoeff = 1
    afterCoeff = (afterActionCoeff, afterValueCoeff)

    terminalController = TrainTerminalController(lossHistorySize,
                                                 terminalThreshold)
    coefficientController = CoefficientCotroller(initCoeff, afterCoeff)

    reportInterval = 10000
    trainStepsIntervel = 1  # 10000

    trainReporter = TrainReporter(numTrainStepEachIteration, reportInterval)
    learningRateDecay = 1
    learningRateDecayStep = 1
    learningRate = 0.0001
    learningRateModifier = LearningRateModifier(learningRate,
                                                learningRateDecay,
                                                learningRateDecayStep)

    trainNN = Train(numTrainStepEachIteration, miniBatchSize, sampleData,
                    learningRateModifier, terminalController,
                    coefficientController, trainReporter)

    # load save dir

    trajectorySaveExtension = '.pickle'
    NNModelSaveExtension = ''
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', 'data', 'iterTrain2wolves1sheepMADDPGEnv',
        'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    NNModelSaveDirectory = os.path.join(dirName, '..', '..', 'data',
                                        'iterTrain2wolves1sheepMADDPGEnv',
                                        'NNModelRes')
    if not os.path.exists(NNModelSaveDirectory):
        os.makedirs(NNModelSaveDirectory)

    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    generateNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                          NNModelSaveExtension,
                                          fixedParameters)

    startTime = time.time()

    sheepDepth = 9
    wolfDepth = 9
    depthList = [sheepDepth, wolfDepth]
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    multiAgentNNmodel = [
        generateModel(sharedWidths * depth, actionLayerWidths,
                      valueLayerWidths, resBlockSize, initializationMethod,
                      dropoutRate)
        for depth, generateModel in zip(depthList, generateModelList)
    ]

    preprocessMultiAgentTrajectories = PreprocessTrajectoriesForBuffer(
        addMultiAgentValuesToTrajectory, removeTerminalTupleFromTrajectory)
    numTrajectoriesToStartTrain = 1024

    trainOneAgent = TrainOneAgent(numTrainStepEachIteration,
                                  numTrajectoriesToStartTrain,
                                  processTrajectoryForPolicyValueNets,
                                  sampleBatchFromBuffer, trainNN)

    # restorePretrainModel
    sheepPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainSheepWithPretrrainWolves', 'trainedResNNModels',
        'agentId=0_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )

    wolvesPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainWolvesTwoCenterControlAction', 'trainedResNNModels',
        'agentId=1_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )

    pretrainModelPathList = [sheepPreTrainModelPath, wolvesPreTrainModelPath]

    sheepId, wolvesId = [0, 1]
    trainableAgentIds = [sheepId, wolvesId]
    for agentId in trainableAgentIds:

        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           pretrainModelPathList[agentId])
        multiAgentNNmodel[agentId] = restoredNNModel

        NNModelPathParameters = {
            'iterationIndex': 0,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
        saveVariables(multiAgentNNmodel[agentId], NNModelSavePath)

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectoriesForParallel = LoadTrajectories(generateTrajectorySavePath,
                                                   loadFromPickle,
                                                   fuzzySearchParameterNames)
    loadTrajectoriesForTrainBreak = LoadTrajectories(
        generateTrajectorySavePath, loadFromPickle)

    # initialize replay buffer
    replayBuffer = []
    trajectoryBeforeTrainIndex = 0
    trajectoryBeforeTrainPathParameters = {
        'iterationIndex': trajectoryBeforeTrainIndex
    }
    trajectoriesBeforeTrain = loadTrajectoriesForParallel(
        trajectoryBeforeTrainPathParameters)
    preProcessedTrajectoriesBeforeTrain = preprocessMultiAgentTrajectories(
        trajectoriesBeforeTrain)
    replayBuffer = saveToBuffer(replayBuffer,
                                preProcessedTrajectoriesBeforeTrain)

    # delete used model for disk space
    fixedParametersForDelete = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius,
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    toDeleteNNModelExtensionList = ['.meta', '.index', '.data-00000-of-00001']
    generatetoDeleteNNModelPathList = [
        GetSavePath(NNModelSaveDirectory, toDeleteNNModelExtension,
                    fixedParametersForDelete)
        for toDeleteNNModelExtension in toDeleteNNModelExtensionList
    ]

    # restore model
    restoredIteration = 0
    for agentId in trainableAgentIds:
        modelPathForRestore = generateNNModelSavePath({
            'iterationIndex': restoredIteration,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        })
        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           modelPathForRestore)
        multiAgentNNmodel[agentId] = restoredNNModel


    # restore buffer
    bufferTrajectoryPathParameters = {
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    restoredIterationIndexRange = range(restoredIteration)
    restoredTrajectories = loadTrajectoriesForTrainBreak(
        parameters=bufferTrajectoryPathParameters,
        parametersWithSpecificValues={
            'iterationIndex': list(restoredIterationIndexRange)
        })
    preProcessedRestoredTrajectories = preprocessMultiAgentTrajectories(
        restoredTrajectories)
    print(len(preProcessedRestoredTrajectories))
    replayBuffer = saveToBuffer(replayBuffer, preProcessedRestoredTrajectories)

    modelMemorySize = 5
    modelSaveFrequency = 50
    deleteUsedModel = DeleteUsedModel(modelMemorySize, modelSaveFrequency,
                                      generatetoDeleteNNModelPathList)
    numIterations = 10000
    for iterationIndex in range(restoredIteration + 1, numIterations):
        print('iterationIndex: ', iterationIndex)

        numCpuToUseWhileTrain = 16
        numCmdList = min(numTrajectoriesPerIteration, numCpuToUseWhileTrain)
        sampleTrajectoryFileName = 'sampleMultiMCTSAgentCenterControlResNetTrajCondtion.py'

        generateTrajectoriesParallelWhileTrain = GenerateTrajectoriesParallel(
            sampleTrajectoryFileName, numTrajectoriesPerIteration, numCmdList)
        trajectoryPathParameters = {
            'iterationIndex': iterationIndex,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        cmdList = generateTrajectoriesParallelWhileTrain(
            trajectoryPathParameters)

        trajectories = loadTrajectoriesForParallel(trajectoryPathParameters)
        trajectorySavePath = generateTrajectorySavePath(
            trajectoryPathParameters)
        saveToPickle(trajectories, trajectorySavePath)

        preProcessedTrajectories = preprocessMultiAgentTrajectories(
            trajectories)
        updatedReplayBuffer = saveToBuffer(replayBuffer,
                                           preProcessedTrajectories)

        for agentId in trainableAgentIds:

            updatedAgentNNModel = trainOneAgent(agentId, multiAgentNNmodel,
                                                updatedReplayBuffer)

            NNModelPathParameters = {
                'iterationIndex': iterationIndex,
                'agentId': agentId,
                'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
                'numTrainStepEachIteration': numTrainStepEachIteration
            }
            NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
            saveVariables(updatedAgentNNModel, NNModelSavePath)
            multiAgentNNmodel[agentId] = updatedAgentNNModel
            replayBuffer = updatedReplayBuffer

            deleteUsedModel(iterationIndex, agentId)

    endTime = time.time()
    print("Time taken for {} iterations: {} seconds".format(
        numIterations, (endTime - startTime)))
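The iteration loop keeps a bounded replay buffer: pre-processed time steps from each iteration are appended via saveToBuffer and, presumably, the oldest entries are dropped once bufferSize is exceeded. A minimal sketch of that behaviour; SaveToBufferSketch is a hypothetical stand-in for the SaveToBuffer used above.

class SaveToBufferSketch:
    def __init__(self, bufferSize):
        self.bufferSize = bufferSize

    def __call__(self, buffer, newTimeSteps):
        # append, then keep only the most recent bufferSize entries
        updatedBuffer = list(buffer) + list(newTimeSteps)
        return updatedBuffer[-self.bufferSize:]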