def __call__(self, sampleAction):
    self.sampleAction = sampleAction
    numMDPTimeStepPerSecond = 5
    maxRunningSteps = 25 * numMDPTimeStepPerSecond
    trajectories = []
    sampleTrajectory = SampleTrajectory(maxRunningSteps, self.isTerminal, self.resetState, self.forwardOneStep)
    # Alternative policies kept for reference:
    # randomPolicy = RandomPolicy(actionSpace)
    # sampleAction = lambda state: sampleFromDistribution(randomPolicy(state))  # random policy
    # sampleAction = lambda state: actionSpace[dqn.GetMaxAction(flatten(state))]  # DQN policy
    for trajectoryId in range(self.numTrajectories):
        trajectory = sampleTrajectory(self.sampleAction)
        trajectories.append(trajectory)
    return trajectories
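# A minimal sketch (an assumption, not taken from the source) of the constructor this
# __call__ is assumed to pair with; the class name is hypothetical and the attribute
# names simply mirror those used above.
class GenerateTrajectories:
    def __init__(self, numTrajectories, isTerminal, resetState, forwardOneStep):
        self.numTrajectories = numTrajectories  # how many trajectories to sample
        self.isTerminal = isTerminal            # terminal-state predicate
        self.resetState = resetState            # initial-state sampler
        self.forwardOneStep = forwardOneStep    # one-step MDP transition + reward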
def __call__(self, parameters):
    numOfAgent = parameters['numOfAgent']
    trajectories = []
    for trajectoryId in range(self.numTrajectories):
        forwardOneStep = self.composeFowardOneTimeStepWithRandomSubtlety(numOfAgent)

        sheepId = 0
        wolfId = 1
        distractorsIds = list(range(2, numOfAgent))
        distanceToVisualDegreeRatio = 20
        minInitSheepWolfDistance = 9 * distanceToVisualDegreeRatio
        minInitSheepDistractorDistance = 2.5 * distanceToVisualDegreeRatio  # no distractor in the kill zone at initialization
        isLegalInitPositions = IsLegalInitPositions(sheepId, wolfId, distractorsIds, minInitSheepWolfDistance, minInitSheepDistractorDistance)

        xBoundary = [0, 600]
        yBoundary = [0, 600]
        resetState = ResetState(xBoundary, yBoundary, numOfAgent, isLegalInitPositions, transPolarToCartesian)

        killzoneRadius = 2.5 * distanceToVisualDegreeRatio
        isTerminal = IsTerminal(sheepId, wolfId, killzoneRadius)

        numMDPTimeStepPerSecond = 5
        maxRunningSteps = 25 * numMDPTimeStepPerSecond
        sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, forwardOneStep)

        # 8 unit-vector actions at 45-degree increments
        numActionDirections = 8
        actionSpace = [(np.cos(directionId * 2 * math.pi / numActionDirections),
                        np.sin(directionId * 2 * math.pi / numActionDirections))
                       for directionId in range(numActionDirections)]
        randomPolicy = RandomPolicy(actionSpace)
        sampleAction = lambda state: sampleFromDistribution(randomPolicy(state))

        trajectory = sampleTrajectory(sampleAction)
        trajectories.append(trajectory)
    return trajectories
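# Hedged sketch (an assumption, not taken from the source) of the sampleFromDistribution
# helper used above: it is assumed to draw one action from an {action: probability} dict.
def sampleFromDistribution(actionDistribution):
    actions = list(actionDistribution.keys())
    probabilities = list(actionDistribution.values())
    index = np.random.choice(len(actions), p=probabilities)
    return actions[index]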
def main():
    # MDP environment
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    # k = np.random.choice(actionSpace)
    # print(k)

    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 5
    cInit = 100
    cBase = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    # progressive-widening parameters for expanding sampled next states
    alpha = 0
    C = 3
    pWidening = PWidening(alpha, C)
    expandNewState = ExpandNextState(twoAgentTransit, pWidening)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # flat heuristic; a truncated rollout sometimes collects no reward at all
    maxRolloutStep = 10
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit, rewardFunction, isTerminal, rolloutHeuristic)

    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand, expandNewState, estimateValue, backup, establishPlainActionDist)

    # sampleAction = SampleFromDistribution(actionDictionary)
    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]
    # print(findCumulativeReward(trajectories))
    print(trajectories)

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateObstacle2', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    # generate demo images
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255]])
    circleSize = 10
    positionIndex = [0, 1]
    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)
    saveImageDir = os.path.join(imageSavePath, imageFolderName)
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize, agentIdsToDraw, positionIndex, saveImage, saveImageDir, drawBackground)

    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate, twoAgentTransit)
    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1
    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState, interpolateState, actionIndexInTimeStep)
    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
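# Hedged sketch (an assumption, not taken from the source) of the maxFromDistribution
# helper used in sampleAction above: it is assumed to return the key with the highest
# probability in an {action: probability} dict.
def maxFromDistribution(actionDistribution):
    return max(actionDistribution, key=actionDistribution.get)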
def mctsPolicy():
    lowerBound = 0
    gridSize = 10
    upperBound = [gridSize - 1, gridSize - 1]
    maxRunningSteps = 50

    actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    numActionSpace = len(actionSpace)
    sheepSpeedRatio = 1
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * sheepSpeedRatio))

    numOfAgent = 5  # 1 hunter, 1 stag with high value, 3 rabbits with low value
    hunterId = [0]
    targetIds = [1, 2, 3, 4]
    stagId = [1]
    rabbitId = [2, 3, 4]
    positionIndex = [0, 1]

    getHunterPos = GetAgentPosFromState(hunterId, positionIndex)
    getTargetsPos = GetAgentPosFromState(targetIds, positionIndex)

    stayWithinBoundary = env.StayWithinBoundary(upperBound, lowerBound)
    isTerminal = env.IsTerminal(getHunterPos, getTargetsPos)
    transitionFunction = env.Transition(stayWithinBoundary)
    reset = env.Reset(upperBound, lowerBound, numOfAgent)

    # stagPolicy = RandomPolicy(sheepActionSpace)
    stagPolicy = stationaryAgentPolicy
    rabbitPolicies = [stationaryAgentPolicy] * len(rabbitId)

    cInit = 1
    cBase = 100
    calculateScore = ScoreChild(cInit, cBase)
    selectChild = SelectChild(calculateScore)
    getActionPrior = lambda state: {action: 1 / len(actionSpace) for action in actionSpace}

    def wolfTransit(state, action):
        return transitionFunction(
            state,
            [action, maxFromDistribution(stagPolicy(state))]
            + [maxFromDistribution(rabbitPolicy(state)) for rabbitPolicy in rabbitPolicies])

    stepPenalty = -1 / maxRunningSteps
    catchBonus = 1
    highRewardRatio = 10
    rewardFunction = reward.RewardFunction(highRewardRatio, stepPenalty, catchBonus, isTerminal)

    initializeChildren = InitializeChildren(actionSpace, wolfTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    def rolloutPolicy(state):
        return actionSpace[np.random.choice(range(numActionSpace))]

    rolloutHeuristicWeight = 0
    rolloutHeuristic = reward.HeuristicDistanceToTarget(rolloutHeuristicWeight, getHunterPos, getTargetsPos)
    # rolloutHeuristic = lambda state: 0
    maxRolloutSteps = 20
    rollout = RollOut(rolloutPolicy, maxRolloutSteps, wolfTransit, rewardFunction, isTerminal, rolloutHeuristic)

    numSimulations = 600
    wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup, establishSoftmaxActionDist)

    # All agents' policies
    policy = lambda state: [wolfPolicy(state), stagPolicy(state)] + [rabbitPolicy(state) for rabbitPolicy in rabbitPolicies]

    # rendering setup (note: these re-assignments override the values set above)
    numOfAgent = 5
    gridSize = 15
    maxRunningSteps = 50
    screenWidth = 600
    screenHeight = 600
    fullScreen = False
    initializeScreen = InitializeScreen(screenWidth, screenHeight, fullScreen)
    screen = initializeScreen()
    # pg.mouse.set_visible(False)

    leaveEdgeSpace = 2
    lineWidth = 1
    backgroundColor = [205, 255, 204]
    lineColor = [0, 0, 0]
    distractorColor = [255, 255, 0]
    targetColor = [221, 160, 221]
    playerColor = [50, 50, 255]
    targetRadius = 10
    playerRadius = 10
    textColorTuple = (255, 50, 50)
    drawBackground = DrawBackground(screen, gridSize, leaveEdgeSpace, backgroundColor, lineColor, lineWidth, textColorTuple)
    drawNewState = DrawNewState(screen, drawBackground, targetColor, playerColor, targetRadius, playerRadius)

    chooseAction = [maxFromDistribution] * numOfAgent
    renderOn = True
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transitionFunction, isTerminal, reset, chooseAction, renderOn, drawNewState)

    startTime = time.time()
    numOfEpisodes = 10
    trajectories = [sampleTrajectory(policy) for i in range(numOfEpisodes)]
    finishedTime = time.time() - startTime

    print('length:', len(trajectories[0]))
    print('time:', finishedTime)
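# Hedged sketch (an assumption, not taken from the source) of GetAgentPosFromState as
# used above: it is assumed to pull the x/y columns of the listed agents out of the
# joint state array.
class GetAgentPosFromState:
    def __init__(self, agentIds, positionIndex):
        self.agentIds = agentIds
        self.positionIndex = positionIndex

    def __call__(self, state):
        state = np.asarray(state)
        return state[self.agentIds][:, self.positionIndex]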
def main():
    # MDP environment
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [1, 1]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [200, 200]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset(xBoundaryReset, yBoundaryReset, numOfAgent, target)

    actionSpace = [[10, 0], [-10, 0], [-10, -10], [10, 10], [0, 10], [0, -10], [-10, 10], [10, -10]]

    actionCost = -1
    swampPenalty = -10
    terminalReward = 10
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()
    sampleAction = SampleFromDistribution(actionDistribution)

    trajectories = [sampleTrajectory(sampleAction) for _ in range(10)]

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateObstacle', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    # generate demo images
    screenWidth = 600
    screenHeight = 600
    screen = pg.display.set_mode((screenWidth, screenHeight))
    screenColor = THECOLORS['black']
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    lineColor = THECOLORS['white']
    lineWidth = 4
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth, xSwamp, ySwamp)

    fps = 40
    circleColorSpace = np.array([[0, 0, 255], [0, 255, 255]])
    circleSize = 10
    positionIndex = [0, 1]
    saveImage = True
    imageSavePath = os.path.join(trajectoryDirectory, 'picMovingSheep')
    if not os.path.exists(imageSavePath):
        os.makedirs(imageSavePath)
    trajectoryParameters = 'obstacle'
    imageFolderName = str(trajectoryParameters)
    saveImageDir = os.path.join(imageSavePath, imageFolderName)
    if not os.path.exists(saveImageDir):
        os.makedirs(saveImageDir)
    agentIdsToDraw = list(range(2))
    drawState = DrawState(fps, screen, circleColorSpace, circleSize, agentIdsToDraw, positionIndex, saveImage, saveImageDir, drawBackground)

    numFramesToInterpolate = 3
    interpolateState = InterpolateState(numFramesToInterpolate, twoAgentTransit)
    stateIndexInTimeStep = 0
    actionIndexInTimeStep = 1
    chaseTrial = ChaseTrialWithTraj(stateIndexInTimeStep, drawState, interpolateState, actionIndexInTimeStep)
    [chaseTrial(trajectory) for trajectory in trajectories]
    pg.quit()
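# Hedged sketch (an assumption, not taken from the source) of the swamp-world
# RewardFunction wired up above: a fixed cost per step, an extra penalty while the
# moving agent is inside the swamp, and a bonus when the terminal region is reached.
# The __call__ signature is also an assumption.
class RewardFunction:
    def __init__(self, actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp):
        self.actionCost = actionCost
        self.terminalReward = terminalReward
        self.swampPenalty = swampPenalty
        self.isTerminal = isTerminal
        self.isInSwamp = isInSwamp

    def __call__(self, state, action):
        reward = self.actionCost
        if self.isInSwamp(state):
            reward += self.swampPenalty
        if self.isTerminal(state):
            reward += self.terminalReward
        return reward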
def __call__(self, parameters):
    print(parameters)
    numSimulation = parameters['numSimulation']

    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = parameters['noise']
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    # k = np.random.choice(actionSpace)
    # print(k)

    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    # numSimulation = 50
    cInit = 100
    cBase = 1
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # flat heuristic; a truncated rollout sometimes collects no reward at all
    maxRolloutStep = 100
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit, rewardFunction, isTerminal, rolloutHeuristic)

    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand, expandNewState, estimateValue, backup, establishPlainActionDist)

    # sampleAction = SampleFromDistribution(actionDictionary)
    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        trajectory = sampleTrajectory(sampleAction)
        trajectoriesWithIntentionDists.append(trajectory)
    print(trajectoriesWithIntentionDists)

    trajectoryFixedParameters = {'Algorithm': "MCTS"}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters, parameters)
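# Hypothetical usage of this parameter-sweep __call__ (class and argument names are
# assumptions; only the parameter keys 'numSimulation' and 'noise' come from the code above):
# generateTrajectories = GenerateTrajectoriesForCondition(numTrajectories, saveTrajectoryByParameters)
# generateTrajectories({'numSimulation': 50, 'noise': [50, 50]})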
initializeScreen = InitializeScreen(screenWidth, screenHeight, fullScreen)
screen = initializeScreen()
# pg.mouse.set_visible(False)

leaveEdgeSpace = 2
lineWidth = 1
backgroundColor = [205, 255, 204]
lineColor = [0, 0, 0]
targetColor = [255, 50, 50]
playerColor = [50, 50, 255]
targetRadius = 10
playerRadius = 10
textColorTuple = (255, 50, 50)
drawBackground = DrawBackground(screen, gridSize, leaveEdgeSpace, backgroundColor, lineColor, lineWidth, textColorTuple)
drawNewState = DrawNewState(screen, drawBackground, targetColor, playerColor, targetRadius, playerRadius)

chooseAction = [maxFromDistribution] * numOfAgent
renderOn = True
sampleTrajectory = SampleTrajectory(maxRunningSteps, transitionFunction, isTerminal, reset, chooseAction, renderOn, drawNewState)

startTime = time.time()
numOfEpisodes = 30
trajectories = [sampleTrajectory(policy) for i in range(numOfEpisodes)]
finishedTime = time.time() - startTime

print('length:', len(trajectories[0]))
print('time:', finishedTime)
def __call__(self, parameters):
    print(parameters)
    numSimulation = parameters['numSimulation']
    rolloutHeuristic = parameters['rolloutHeuristic']

    xBoundary = [0, 400]
    yBoundary = [0, 400]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[175, 225], [175, 225]]]

    noise = parameters['noise']
    cBase = parameters['cBase']
    maxRolloutStep = parameters['maxRolloutStep']

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 40
    target = [400, 400]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    maxRolloutStep = 15  # note: overrides the 'maxRolloutStep' parameter read above

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 400], [0, 400], numOfAgent, target)

    actionSpace = [(50, 0), (-50, 0), (0, 50), (0, -50)]
    actionCost = -0.02
    swampPenalty = -0.5
    terminalReward = 1
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 50
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    cInit = 1
    # cBase = 100
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)

    # Shaped heuristic based on the distance to the target; note that this definition
    # shadows the 'rolloutHeuristic' parameter read above. It maps distance 0 to +1,
    # crosses 0 near 100 * ln(2) ≈ 69.3, and approaches -1 for distant states.
    def rolloutHeuristic(allState):
        [state, terminalPosition] = allState
        distanceToTerminal = np.linalg.norm(np.array(target) - np.array(state), ord=2)
        return 2 * np.exp(-distanceToTerminal / 100) - 1

    # rolloutHeuristic = lambda state: 0  # flat heuristic; a truncated rollout sometimes collects no reward at all
    # maxRolloutStep = 15
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit, rewardFunction, isTerminal, rolloutHeuristic)

    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand, expandNewState, estimateValue, backup, establishPlainActionDist)

    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        trajectory = sampleTrajectory(sampleAction)
        trajectoriesWithIntentionDists.append(trajectory)
    # print(trajectoriesWithIntentionDists)

    trajectoryFixedParameters = {'Algorithm': "MCTSNew"}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters, parameters)
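# Quick standalone check of the shaped heuristic used above (only numpy assumed);
# the three distances are illustrative values:
for distance in [0.0, 69.3, 400.0]:
    print(distance, 2 * np.exp(-distance / 100) - 1)  # ≈ 1.0, ≈ 0.0, ≈ -0.96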
def main():
    # MDP environment
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    xSwamp = [300, 400]
    ySwamp = [300, 400]
    swamp = [[[300, 400], [300, 400]]]

    noise = [50, 50]
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    transitionWithNoise = TransitionWithNoise(noise)

    minDistance = 50
    target = [600, 600]
    isTerminal = IsTerminal(minDistance, target)
    isInSwamp = IsInSwamp(swamp)

    singleAgentTransit = MovingAgentTransitionInSwampWorld(transitionWithNoise, stayInBoundaryByReflectVelocity, isTerminal)
    transitionFunctionPack = [singleAgentTransit, static]
    multiAgentTransition = MultiAgentTransitionInGeneral(transitionFunctionPack)
    twoAgentTransit = MultiAgentTransitionInSwampWorld(multiAgentTransition, target)

    numOfAgent = 2
    xBoundaryReset = [500, 600]
    yBoundaryReset = [0, 100]
    resetState = Reset([0, 0], [0, 0], numOfAgent, target)

    actionSpace = [(100, 0), (-100, 0), (0, 100), (0, -100)]
    # k = np.random.choice(actionSpace)
    # print(k)

    actionCost = -1
    swampPenalty = -100
    terminalReward = 1000
    rewardFunction = RewardFunction(actionCost, terminalReward, swampPenalty, isTerminal, isInSwamp)

    maxRunningSteps = 100
    oneStepSampleTrajectory = OneStepSampleTrajectory(twoAgentTransit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, resetState, oneStepSampleTrajectory)

    randomPolicy = RandomPolicy(actionSpace)
    actionDistribution = randomPolicy()

    # MCTS components: numSimulation, selectAction, selectNextState, expand, estimateValue, backup, outputDistribution
    numSimulation = 50
    cInit = 1
    cBase = 100
    scoreChild = ScoreChild(cInit, cBase)
    selectAction = SelectAction(scoreChild)
    selectNextState = SelectNextState(selectAction)

    uniformActionPrior = {action: 1 / 4 for action in actionSpace}
    getActionPrior = lambda state: uniformActionPrior
    initializeChildren = InitializeChildren(actionSpace, twoAgentTransit, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)
    expandNewState = ExpandNextState(twoAgentTransit)

    rolloutPolicy = lambda state: random.choice(actionSpace)
    rolloutHeuristic = lambda state: 0  # flat heuristic; a truncated rollout sometimes collects no reward at all
    maxRolloutStep = 100
    estimateValue = RollOut(rolloutPolicy, maxRolloutStep, twoAgentTransit, rewardFunction, isTerminal, rolloutHeuristic)

    mctsSelectAction = MCTS(numSimulation, selectAction, selectNextState, expand, expandNewState, estimateValue, backup, establishPlainActionDist)

    # sampleAction = SampleFromDistribution(actionDictionary)
    def sampleAction(state):
        actionDist = mctsSelectAction(state)
        action = maxFromDistribution(actionDist)
        return action

    trajectories = [sampleTrajectory(sampleAction) for _ in range(1)]

    DIRNAME = os.path.dirname(__file__)
    trajectoryDirectory = os.path.join(DIRNAME, '..', '..', 'data', 'evaluateCompeteDetection', 'trajectories')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)

    print(trajectories)
    print(findCumulativeReward(trajectories))
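# Hedged sketch (an assumption, not taken from the source) of the findCumulativeReward
# helper called above: it is assumed to sum the per-step rewards of each trajectory,
# with the reward assumed to be the last entry of every time-step tuple.
def findCumulativeReward(trajectories):
    return [sum(timeStep[-1] for timeStep in trajectory) for trajectory in trajectories]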
def main():
    DEBUGForParameterInMain = 1  # can be deleted if you are familiar with passing parameters on the command line
    if DEBUGForParameterInMain:
        parametersForTrajectoryPath = {}
        startSampleIndex = 0
        endSampleIndex = 2
        numSheep = 2
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
    else:
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        numSheep = int(parametersForTrajectoryPath['numSheep'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)

    # skip sampling if the trajectory file already exists
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(dirName, '..', '..', '..', 'data', 'generateGuidedMCTSWeWithRolloutBothWolfSheep', 'OneLeveLPolicy', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    numOneWolfActionSpace = 9
    NNNumSimulations = 200
    numWolves = 3
    maxRunningSteps = 100
    softParameterInPlanning = 2.5
    sheepPolicyName = 'maxNNPolicy'
    wolfPolicyName = 'maxNNPolicy'
    trajectoryFixedParameters = {
        'priorType': 'uniformPrior',
        'sheepPolicy': sheepPolicyName,
        'wolfPolicy': wolfPolicyName,
        'NNNumSimulations': NNNumSimulations,
        'policySoftParameter': softParameterInPlanning,
        'maxRunningSteps': maxRunningSteps,
        'numOneWolfActionSpace': numOneWolfActionSpace,
        'numSheep': numSheep,
        'numWolves': numWolves
    }
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory, trajectorySaveExtension, trajectoryFixedParameters)
    trajectorySavePath = generateTrajectorySavePath(parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        # main function as usual
        # xxx, xxx
        sampleTrajectory = SampleTrajectory(xxx, xxx)
        trajectories = [sampleTrajectory(policy) for trajectoryIndex in range(startSampleIndex, endSampleIndex)]
        saveToPickle(trajectories, trajectorySavePath)
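# Hedged sketch (an assumption, not taken from the source) of the saveToPickle helper
# used above: a thin wrapper around pickle.dump.
import pickle

def saveToPickle(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f)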