def __call__(self, parameters):
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = 1

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)

    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[possibleWolvesIds]
    killzoneRadius = 50
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll, getWolvesStatesFromAll)

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 3
    transit = TransitWithTerminalCheckOfInterpolation(numFramesToInterpolate, interpolateOneFrame, isTerminal)

    maxRunningSteps = 52
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus, isTerminal)

    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    # Sheep Part

    # Sheep Policy Function
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace, numSheepActionSpace, regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth, actionLayerWidths, valueLayerWidths,
            resBlockSize, initializationMethod, dropoutRate)
    sheepModelPath = os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=0.' + str(numWolves) + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000')
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel, sheepIndividualActionSpace)

    # Sheep Generate Action
    softParameterInPlanningForSheep = 2.5
    softPolicyInPlanningForSheep = SoftDistribution(softParameterInPlanningForSheep)
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
            sheepPolicy(relativeAgentsStatesForSheepPolicy))

    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [SampleActionOnFixedIntention(selfId, possibleWolvesIds, sheepPolicy,
            sheepChooseActionMethod) for selfId in possibleSheepIds]

    # Wolves Part
    # Policy Likelihood function: Wolf Central Control NN Policy Given Intention
    numWolvesStateSpaces = [2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)]
    actionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [list(it.product(wolfIndividualActionSpace, repeat=numInWe))
            for numInWe in range(2, numWolves + 1)]
    numWolvesCentralControlActionSpaces = [len(wolvesCentralControlActionSpace)
            for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
            for numStateSpace, numActionSpace in zip(numWolvesStateSpaces, numWolvesCentralControlActionSpaces)]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
            actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
            for generateWolvesCentralControlModel in generateWolvesCentralControlModels]
    NNNumSimulations = 250
    wolvesModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=' + str(len(actionSpace) * np.sum([10**_ for _ in range(numInWe)]))
            + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
            for numInWe in range(2, numWolves + 1)]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
            for initWolvesCentralControlModel, wolvesModelPath in zip(initWolvesCentralControlModels, wolvesModelPaths)]
    wolvesCentralControlPolicies = [ApproximatePolicy(NNModel, actionSpace)
            for NNModel, actionSpace in zip(wolvesCentralControlNNModels, wolvesCentralControlActionSpaces)]

    centralControlPolicyListBasedOnNumAgentsInWe = wolvesCentralControlPolicies  # 0 for two agents in We, 1 for three agents...
    softParameterInInference = 1
    softPolicyInInference = SoftDistribution(softParameterInInference)
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(centralControlPolicyListBasedOnNumAgentsInWe,
            softPolicyInInference, getStateThirdPersonPerspective)
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsPolicyLikelihood(policyForCommittedAgentsInInference)

    wolfLevel2ActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7)]
    wolfLevel2IndividualActionSpace = list(map(tuple, np.array(wolfLevel2ActionSpace) * predatorPowerRatio))
    wolfLevel2CentralControlActionSpace = list(it.product(wolfLevel2IndividualActionSpace))
    numWolfLevel2ActionSpace = len(wolfLevel2CentralControlActionSpace)
    regularizationFactor = 1e-4
    generatewolfLevel2Models = [GenerateModel(numStateSpace, numWolfLevel2ActionSpace, regularizationFactor)
            for numStateSpace in numWolvesStateSpaces]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfLevel2NNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initwolfLevel2Models = [generatewolfLevel2Model(sharedWidths * wolfLevel2NNDepth, actionLayerWidths,
            valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
            for generatewolfLevel2Model in generatewolfLevel2Models]
    wolfLevel2ModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=1.' + str(numInWe) + '_depth=9_hierarchy=2_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
            for numInWe in range(2, numWolves + 1)]
    wolfLevel2NNModels = [restoreVariables(initwolfLevel2Model, wolfLevel2ModelPath)
            for initwolfLevel2Model, wolfLevel2ModelPath in zip(initwolfLevel2Models, wolfLevel2ModelPaths)]
    wolfLevel2Policies = [ApproximatePolicy(wolfLevel2NNModel, wolfLevel2CentralControlActionSpace)
            for wolfLevel2NNModel in wolfLevel2NNModels]
    level2PolicyListBasedOnNumAgentsInWe = wolfLevel2Policies  # 0 for two agents in We, 1 for three agents...
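    # The level-1 policies above are central-control networks that output a joint action for
    # the whole "we" group, while the level-2 policies map each wolf's egocentric state to an
    # individual action. The loop below samples both and reports how often the individual
    # (level-2) action stays within a fixed distance of its component of the joint (level-1)
    # action.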
    softPolicy = SoftDistribution(2.5)
    totalInSmallRangeFlags = []
    for trial in range(self.numTrajectories):
        state = reset()
        while isTerminal(state):
            state = reset()
        jointActions = sampleFromDistribution(softPolicy(wolvesCentralControlPolicies[numWolves - 2](state)))
        hierarchyActions = []
        weIds = [list(range(numSheep, numWolves + numSheep)) for _ in range(numWolves)]
        for index in range(numWolves):
            weId = weIds[index].copy()
            weId.insert(0, weId.pop(index))
            relativeId = [0] + weId
            action = sampleFromDistribution(softPolicy(wolfLevel2Policies[numWolves - 2](state[relativeId])))
            hierarchyActions.append(action)

        reasonableActionRange = [int(np.linalg.norm(np.array(jointAction) - np.array(hierarchyAction)) <= 8 * predatorPowerRatio)
                for jointAction, hierarchyAction in zip(jointActions, hierarchyActions)
                if jointAction != (0, 0) and hierarchyAction != (0, 0)]
        totalInSmallRangeFlags = totalInSmallRangeFlags + reasonableActionRange

    inSmallRangeRateMean = np.mean(totalInSmallRangeFlags)
    return inSmallRangeRateMean
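
# The inner loop above rebuilds each wolf's egocentric ordering of agent indices: the sheep
# (id 0) stays first, the acting wolf is moved to the front of the "we" group, and the other
# wolves follow. A minimal standalone sketch of that index shuffle (the helper name is
# hypothetical, not part of the repo):
def relativeAgentIds(selfWolfIndex, numWolves, numSheep=1):
    # ids of the wolves in the "we" group, e.g. [1, 2, 3] for 3 wolves and 1 sheep
    weIds = list(range(numSheep, numWolves + numSheep))
    # move the acting wolf to the front of its group
    weIds.insert(0, weIds.pop(selfWolfIndex))
    # sheep id(s) first, then the reordered wolves
    return list(range(numSheep)) + weIds

# e.g. with 3 wolves, the second wolf (index 1) orders the state as [0, 2, 1, 3]
assert relativeAgentIds(1, 3) == [0, 2, 1, 3]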
def main():
    numWolves = 2
    numSheep = 1

    numWolvesStateSpaces = [2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7)]
    #actionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [list(it.product(wolfIndividualActionSpace, repeat=numInWe))
            for numInWe in range(2, numWolves + 1)]
    numWolvesCentralControlActionSpaces = [len(wolvesCentralControlActionSpace)
            for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
            for numStateSpace, numActionSpace in zip(numWolvesStateSpaces, numWolvesCentralControlActionSpaces)]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
            actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
            for generateWolvesCentralControlModel in generateWolvesCentralControlModels]
    NNNumSimulations = 250
    wolvesModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=' + str(len(actionSpace) * np.sum([10**_ for _ in range(numInWe)]))
            + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
            for numInWe in range(2, numWolves + 1)]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
            for initWolvesCentralControlModel, wolvesModelPath in zip(initWolvesCentralControlModels, wolvesModelPaths)]
    wolvesValueFunctionListBasedOnNumAgentsInWe = [ApproximateValue(NNModel)
            for NNModel in wolvesCentralControlNNModels]
    valueFunction = wolvesValueFunctionListBasedOnNumAgentsInWe[numWolves - 2]

    xBoundary = [0, 600]
    yBoundary = [0, 600]
    reset = Reset(xBoundary, yBoundary, numWolves)

    numGridX = 120
    numGridY = 120
    xInterval = (xBoundary[1] - xBoundary[0]) / numGridX
    yInterval = (yBoundary[1] - yBoundary[0]) / numGridY
    sheepXPosition = [(gridIndex + 0.5) * xInterval for gridIndex in range(numGridX)]
    sheepYPosition = [(gridIndex + 0.5) * yInterval for gridIndex in range(numGridY)]

    wolvesState = reset()
    wolvesState = np.array([[300, 350], [550, 400]])
    print(wolvesState)

    levelValues = [sheepXPosition, sheepYPosition]
    levelNames = ["sheepXPosition", "sheepYPosition"]
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)

    evaluate = lambda df: evaluateValue(df, valueFunction, wolvesState)
    valueResultDf = toSplitFrame.groupby(levelNames).apply(evaluate)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    drawHeatmapPlot(valueResultDf, ax)
    fig.savefig('valueMap2', dpi=300)
    plt.show()
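
# `evaluateValue` and `drawHeatmapPlot` are defined elsewhere in the repo. A minimal sketch
# of what the grouped evaluation is assumed to do (the function name and signature here are
# illustrative): for each candidate sheep position on the grid, prepend it to the fixed
# wolves positions and query the central-control value network.
import numpy as np
import pandas as pd

def evaluateValueSketch(oneConditionDf, valueFunction, wolvesState):
    sheepX = oneConditionDf.index.get_level_values('sheepXPosition')[0]
    sheepY = oneConditionDf.index.get_level_values('sheepYPosition')[0]
    # state ordering assumed by the value network: sheep first, then the wolves
    state = np.concatenate([[[sheepX, sheepY]], wolvesState])
    return pd.Series({'value': valueFunction(state)})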
def __call__(self, parameters):
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    softParameterInInference = parameters['inferenceSoft']
    softParameterInPlanning = parameters['wolfPolicySoft']
    otherCompeteRate = parameters['otherCompeteRate']
    competeDetectionRate = parameters['competeDetectionRate']

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)

    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[possibleWolvesIds]
    killzoneRadius = 50
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll, getWolvesStatesFromAll)

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 3
    transit = TransitWithTerminalCheckOfInterpolation(numFramesToInterpolate, interpolateOneFrame, isTerminal)

    maxRunningSteps = 61
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus, isTerminal)

    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    # Sheep Part

    # Sheep Policy Function
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace, numSheepActionSpace, regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth, actionLayerWidths, valueLayerWidths,
            resBlockSize, initializationMethod, dropoutRate)
    sheepModelPath = os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=0.' + str(numWolves) + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000')
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel, sheepIndividualActionSpace)

    # Sheep Generate Action
    softParameterInPlanningForSheep = 2.0
    softPolicyInPlanningForSheep = SoftDistribution(softParameterInPlanningForSheep)
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
            sheepPolicy(relativeAgentsStatesForSheepPolicy))

    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [SampleActionOnFixedIntention(selfId, possibleWolvesIds, softenSheepPolicy,
            sheepChooseActionMethod) for selfId in possibleSheepIds]

    # Wolves Part
    # Percept Action For Inference
    perceptAction = lambda action: action

    # Policy Likelihood function: Wolf Central Control NN Policy Given Intention
    numWolvesStateSpaces = [2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [list(it.product(wolfIndividualActionSpace, repeat=numInWe))
            for numInWe in range(2, numWolves + 1)]
    numWolvesCentralControlActionSpaces = [len(wolvesCentralControlActionSpace)
            for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
            for numStateSpace, numActionSpace in zip(numWolvesStateSpaces, numWolvesCentralControlActionSpaces)]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
            actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
            for generateWolvesCentralControlModel in generateWolvesCentralControlModels]
    NNNumSimulations = 250
    wolvesModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=' + str(8 * np.sum([10**_ for _ in range(numInWe)]))
            + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
            for numInWe in range(2, numWolves + 1)]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
            for initWolvesCentralControlModel, wolvesModelPath in zip(initWolvesCentralControlModels, wolvesModelPaths)]
    wolvesCentralControlPolicies = [ApproximatePolicy(NNModel, actionSpace)
            for NNModel, actionSpace in zip(wolvesCentralControlNNModels, wolvesCentralControlActionSpaces)]
    centralControlPolicyListBasedOnNumAgentsInWe = wolvesCentralControlPolicies  # 0 for two agents in We, 1 for three agents...
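    # Inference machinery: each wolf maintains a distribution over intentions (which sheep is
    # the goal and who belongs to the "we" group). The likelihood of an observed joint action
    # factorizes into committed agents, assumed to follow the central-control policy, and
    # uncommitted agents, assumed to follow an individual heat-seeking policy.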
    softPolicyInInference = SoftDistribution(softParameterInInference)
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(centralControlPolicyListBasedOnNumAgentsInWe,
            softPolicyInInference, getStateOrActionThirdPersonPerspective)
    concernedAgentsIds = [2]
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsPolicyLikelihood(concernedAgentsIds,
            policyForCommittedAgentsInInference)

    getGoalStateForIndividualHeatseeking = lambda statesRelative: np.array(statesRelative)[0]
    getSelfStateForIndividualHeatseeking = lambda statesRelative: np.array(statesRelative)[1]
    heatseekingPrecesion = 1.83
    heatSeekingDiscreteStochasticPolicy = HeatSeekingDiscreteStochasticPolicy(heatseekingPrecesion,
            wolfIndividualActionSpace, getSelfStateForIndividualHeatseeking,
            getGoalStateForIndividualHeatseeking)
    policyForUncommittedAgentsInInference = PolicyForUncommittedAgent(possibleWolvesIds,
            heatSeekingDiscreteStochasticPolicy, softPolicyInInference,
            getStateOrActionFirstPersonPerspective)
    calUncommittedAgentsPolicyLikelihood = CalUncommittedAgentsPolicyLikelihood(possibleWolvesIds,
            concernedAgentsIds, policyForUncommittedAgentsInInference)

    # Joint Likelihood
    calJointLikelihood = lambda intention, state, perceivedAction: \
            calCommittedAgentsPolicyLikelihood(intention, state, perceivedAction) * \
            calUncommittedAgentsPolicyLikelihood(intention, state, perceivedAction)

    wolvesValueListBasedOnNumAgentsInWe = [ApproximateValue(NNModel)
            for NNModel in wolvesCentralControlNNModels]
    calIntentionValueGivenState = CalIntentionValueGivenState(wolvesValueListBasedOnNumAgentsInWe)
    softParamterForValue = 0.01
    softValueToBuildDistribution = SoftMax(softParamterForValue)
    adjustIntentionPriorGivenValueOfState = AdjustIntentionPriorGivenValueOfState(calIntentionValueGivenState,
            softValueToBuildDistribution)

    # Sample and Save Trajectory
    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        # Intention Prior For inference
        otherWolfPossibleIntentionSpaces = {0: [(0, (1, 2))], 1: [(0, ())]}
        otherIntentionType = np.random.choice([1, 0], p=[otherCompeteRate, 1 - otherCompeteRate])
        otherWolfIntentionSpace = otherWolfPossibleIntentionSpaces[otherIntentionType]
        selfPossibleIntentionSpaces = {0: [(0, (1, 2))], 0.5: [(0, (1, 2)), (0, ())], 1: [(0, ())]}
        selfWolfIntentionSpace = selfPossibleIntentionSpaces[competeDetectionRate]
        intentionSpacesForAllWolves = [selfWolfIntentionSpace, otherWolfIntentionSpace]
        wolvesIntentionPriors = [{tuple(intention): 1 / len(allPossibleIntentionsOneWolf)
                for intention in allPossibleIntentionsOneWolf}
                for allPossibleIntentionsOneWolf in intentionSpacesForAllWolves]

        # Infer and update Intention
        variablesForAllWolves = [[intentionSpace] for intentionSpace in intentionSpacesForAllWolves]
        jointHypothesisSpaces = [pd.MultiIndex.from_product(variables, names=['intention'])
                for variables in variablesForAllWolves]
        concernedHypothesisVariable = ['intention']
        priorDecayRate = 1
        softPrior = SoftDistribution(priorDecayRate)
        inferIntentionOneStepList = [InferOneStep(jointHypothesisSpace, concernedHypothesisVariable,
                calJointLikelihood, softPrior) for jointHypothesisSpace in jointHypothesisSpaces]

        chooseIntention = sampleFromDistribution
        valuePriorEndTime = -100
        updateIntentions = [UpdateIntention(intentionPrior, valuePriorEndTime,
                adjustIntentionPriorGivenValueOfState, perceptAction, inferIntentionOneStep, chooseIntention)
                for intentionPrior, inferIntentionOneStep in zip(wolvesIntentionPriors, inferIntentionOneStepList)]

        # reset intention and adjust intention prior attributes tools for multiple trajectory
        intentionResetAttributes = ['timeStep', 'lastState', 'lastAction', 'intentionPrior', 'formerIntentionPriors']
        intentionResetAttributeValues = [dict(zip(intentionResetAttributes,
                [0, None, None, intentionPrior, [intentionPrior]]))
                for intentionPrior in wolvesIntentionPriors]
        resetIntentions = ResetObjects(intentionResetAttributeValues, updateIntentions)
        returnAttributes = ['formerIntentionPriors']
        getIntentionDistributions = GetObjectsValuesOfAttributes(returnAttributes, updateIntentions)
        attributesToRecord = ['lastAction']
        recordActionForUpdateIntention = RecordValuesForObjects(attributesToRecord, updateIntentions)

        # Wolves Generate Action
        softPolicyInPlanning = SoftDistribution(softParameterInPlanning)
        policyForCommittedAgentInPlanning = PolicyForCommittedAgent(centralControlPolicyListBasedOnNumAgentsInWe,
                softPolicyInPlanning, getStateOrActionThirdPersonPerspective)

        policyForUncommittedAgentInPlanning = PolicyForUncommittedAgent(possibleWolvesIds,
                heatSeekingDiscreteStochasticPolicy, softPolicyInPlanning,
                getStateOrActionFirstPersonPerspective)

        wolfChooseActionMethod = sampleFromDistribution
        getSelfActionThirdPersonPerspective = lambda weIds, selfId: list(weIds).index(selfId)
        chooseCommittedAction = GetActionFromJointActionDistribution(wolfChooseActionMethod,
                getSelfActionThirdPersonPerspective)
        chooseUncommittedAction = sampleFromDistribution
        wolvesSampleIndividualActionGivenIntentionList = [SampleIndividualActionGivenIntention(selfId,
                policyForCommittedAgentInPlanning, policyForUncommittedAgentInPlanning,
                chooseCommittedAction, chooseUncommittedAction)
                for selfId in possibleWolvesIds]

        wolvesSampleActions = [SampleActionOnChangableIntention(updateIntention,
                wolvesSampleIndividualActionGivenIntention)
                for updateIntention, wolvesSampleIndividualActionGivenIntention
                in zip(updateIntentions, wolvesSampleIndividualActionGivenIntentionList)]
        allIndividualSampleActions = sheepSampleActions + wolvesSampleActions
        sampleActionMultiAgent = SampleActionMultiagent(allIndividualSampleActions,
                recordActionForUpdateIntention)
        trajectory = sampleTrajectory(sampleActionMultiAgent)
        intentionDistributions = getIntentionDistributions()
        trajectoryWithIntentionDists = [tuple(list(SASRPair) + list(intentionDist))
                for SASRPair, intentionDist in zip(trajectory, intentionDistributions)]
        trajectoriesWithIntentionDists.append(tuple(trajectoryWithIntentionDists))
        resetIntentions()
        #print(intentionDistributions[-1], otherCompeteRate)

    trajectoryFixedParameters = {'sheepPolicySoft': softParameterInPlanningForSheep,
            'wolfPolicySoft': softParameterInPlanning, 'maxRunningSteps': maxRunningSteps,
            'competePolicy': 'heatseeking', 'NNNumSimulations': NNNumSimulations,
            'heatseekingPrecesion': heatseekingPrecesion}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters, parameters)
    print(np.mean([len(tra) for tra in trajectoriesWithIntentionDists]))
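
# `InferOneStep` is defined elsewhere in the repo. A minimal sketch of the Bayesian update it
# is assumed to perform over the intention hypotheses, using the joint likelihood built above
# (the function name and the omission of the prior-decay step are simplifying assumptions):
def inferIntentionOneStepSketch(intentionPrior, state, perceivedAction, calJointLikelihood):
    unnormalized = {intention: prior * calJointLikelihood(intention, state, perceivedAction)
            for intention, prior in intentionPrior.items()}
    normalizer = sum(unnormalized.values())
    return {intention: weight / normalizer for intention, weight in unnormalized.items()}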
def __call__(self, parameters):
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    softParamterForValue = parameters['valuePriorSoftMaxBeta']
    valuePriorEndTime = parameters['valuePriorEndTime']

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)

    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[possibleWolvesIds]
    killzoneRadius = 25
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll, getWolvesStatesFromAll)

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 5
    transit = TransitWithTerminalCheckOfInterpolation(numFramesToInterpolate, interpolateOneFrame, isTerminal)

    maxRunningSteps = 52
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus, isTerminal)

    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    # Sheep Part

    # Sheep Policy Function
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace, numSheepActionSpace, regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth, actionLayerWidths, valueLayerWidths,
            resBlockSize, initializationMethod, dropoutRate)
    sheepModelPath = os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=0.' + str(numWolves) + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000')
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel, sheepIndividualActionSpace)

    # Sheep Generate Action
    softParameterInPlanningForSheep = 2.5
    softPolicyInPlanningForSheep = SoftDistribution(softParameterInPlanningForSheep)
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
            sheepPolicy(relativeAgentsStatesForSheepPolicy))

    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [SampleActionOnFixedIntention(selfId, possibleWolvesIds, softenSheepPolicy,
            sheepChooseActionMethod) for selfId in possibleSheepIds]

    # Wolves Part
    # Policy Likelihood function: Wolf Central Control NN Policy Given Intention
    numWolvesStateSpaces = [2 * (numInWe + numSheep) for numInWe in range(2, numWolves + 1)]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [list(it.product(wolfIndividualActionSpace, repeat=numInWe))
            for numInWe in range(2, numWolves + 1)]
    numWolvesCentralControlActionSpaces = [len(wolvesCentralControlActionSpace)
            for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
            for numStateSpace, numActionSpace in zip(numWolvesStateSpaces, numWolvesCentralControlActionSpaces)]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
            actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
            for generateWolvesCentralControlModel in generateWolvesCentralControlModels]
    NNNumSimulations = 250
    wolvesModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
            'agentId=.' + str(len(actionSpace) * np.sum([10**_ for _ in range(numInWe)]))
            + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
            for numInWe in range(2, numWolves + 1)]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
            for initWolvesCentralControlModel, wolvesModelPath in zip(initWolvesCentralControlModels, wolvesModelPaths)]
    wolvesCentralControlPolicies = [ApproximatePolicy(NNModel, actionSpace)
            for NNModel, actionSpace in zip(wolvesCentralControlNNModels, wolvesCentralControlActionSpaces)]

    # Wolves Generate Action
    softParameterInPlanning = 2.5
    softPolicyInPlanning = SoftDistribution(softParameterInPlanning)
    wolvesPolicy = lambda state: wolvesCentralControlPolicies[numWolves - 2](state)
    wolfChooseActionMethod = sampleFromDistribution
    wolvesSampleAction = lambda state: wolfChooseActionMethod(softPolicyInPlanning(wolvesPolicy(state)))

    def sampleAction(state):
        action = list(wolvesSampleAction(state)) + [sheepSampleAction(state)
                for sheepSampleAction in sheepSampleActions]
        return action

    # Sample and Save Trajectory
    trajectories = [sampleTrajectory(sampleAction) for _ in range(self.numTrajectories)]
    wolfType = 'sharedReward'
    trajectoryFixedParameters = {'sheepPolicySoft': softParameterInPlanningForSheep,
            'wolfPolicySoft': softParameterInPlanning, 'maxRunningSteps': maxRunningSteps,
            'hierarchy': 0, 'NNNumSimulations': NNNumSimulations, 'wolfType': wolfType}
    self.saveTrajectoryByParameters(trajectories, trajectoryFixedParameters, parameters)
    print(np.mean([len(tra) for tra in trajectories]))
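
# `SampleTrajectory` is defined elsewhere in the repo. A rough sketch of the rollout loop it
# is assumed to implement with the pieces wired above (an illustrative reconstruction; the
# real class may differ, e.g. in the forwardOneStep signature):
def sampleTrajectorySketch(maxRunningSteps, isTerminal, reset, forwardOneStep, sampleAction):
    state = reset()
    while isTerminal(state):
        state = reset()
    trajectory = []
    for _ in range(maxRunningSteps):
        action = sampleAction(state)
        nextState, reward = forwardOneStep(state, action)
        trajectory.append((state, action, nextState, reward))
        if isTerminal(nextState):
            break
        state = nextState
    return trajectory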