# Third-party imports used by the scripts below (TensorFlow 1.x graph-mode API).
# The project-local module aliases (ag, env, reward, PG, A2CMC, Attention, ba,
# calPosterior) and the bare helper names (AccumulateReward, SampleTrajectory,
# MCTS, Node, backup, ...) refer to this repository's own modules and are
# assumed to be imported alongside these.
import datetime
import itertools as it
import math

import numpy as np
import pygame as pg
import tensorflow as tf
import tensorflow_probability as tfp


def main():
    """Set up the sheep-wolf chasing environment, renderer, and discounted-reward pipeline."""
    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7],
                   [-10, 0], [-7, -7], [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)
    numStateSpace = 4

    initSheepPosition = np.array([180, 180])
    initWolfPosition = np.array([180, 180])
    initSheepVelocity = np.array([0, 0])
    initWolfVelocity = np.array([0, 0])
    initSheepPositionNoise = np.array([90, 150])
    initWolfPositionNoise = np.array([0, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPosition, initSheepPositionNoise)
    wolfPositionReset = ag.WolfPositionReset(initWolfPosition, initWolfPositionNoise)

    numOneAgentState = 2
    positionIndex = [0, 1]
    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                                sheepPositionTransition, wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor,
                        circleColorList, circleSize, saveImage, saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.RewardFunctionTerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex,
                                                          aliveBouns, deathPenalty, isTerminal)
    accumulateReward = AccumulateReward(rewardDecay, rewardFunction)
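
# The AccumulateReward helper constructed above pairs a decay rate with a
# per-step reward function. A minimal sketch of what a helper of this shape
# typically computes -- the discounted return G_t = sum_k gamma^k * r_{t+k}
# for every step of a trajectory -- follows. The class name and the
# (state, action) trajectory format are assumptions for illustration, not
# the project's actual implementation.
class AccumulateRewardSketch:
    def __init__(self, decay, rewardFunction):
        self.decay = decay
        self.rewardFunction = rewardFunction

    def __call__(self, trajectory):
        # trajectory: list of (state, action) pairs, in time order.
        rewards = [self.rewardFunction(state, action) for state, action in trajectory]
        discountedReturn = 0
        accumulatedRewards = []
        for r in reversed(rewards):
            # Work backwards so each step's return reuses the next step's.
            discountedReturn = r + self.decay * discountedReturn
            accumulatedRewards.append(discountedReturn)
        return list(reversed(accumulatedRewards))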
def main():
    """Grid-search policy-gradient training over network width and depth."""
    # Action space.
    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7],
                   [-10, 0], [-7, -7], [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)

    # State space.
    numStateSpace = 4
    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)

    initSheepPositionMean = np.array([180, 180])
    initWolfPositionMean = np.array([180, 180])
    initSheepPositionNoise = np.array([120, 120])
    initWolfPositionNoise = np.array([60, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPositionMean, initSheepPositionNoise, checkBoundaryAndAdjust)
    wolfPositionReset = ag.WolfPositionReset(initWolfPositionMean, initWolfPositionNoise, checkBoundaryAndAdjust)

    numOneAgentState = 2
    positionIndex = [0, 1]
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                                sheepPositionTransition, wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor,
                        circleColorList, circleSize, saveImage, saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.TerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex,
                                            aliveBouns, deathPenalty, isTerminal)
    accumulateRewards = PG.AccumulateRewards(rewardDecay, rewardFunction)

    maxTimeStep = 150
    sampleTrajectory = PG.SampleTrajectory(maxTimeStep, transitionFunction, isTerminal)

    approximatePolicy = PG.ApproximatePolicy(actionSpace)
    trainPG = PG.TrainTensorflow(actionSpace)

    numTrajectory = 20
    maxEpisode = 1000

    # Generate models.
    learningRate = 1e-4
    hiddenNeuronNumbers = [128, 256, 512, 1024]
    hiddenDepths = [2, 4, 8]
    # hiddenNeuronNumbers = [128]
    # hiddenDepths = [2]
    generateModel = GeneratePolicyNet(numStateSpace, numActionSpace, learningRate)
    # Key (n, d): d hidden layers of round(n / d) units, so ~n neurons in total.
    models = {(n, d): generateModel(d, round(n / d))
              for n, d in it.product(hiddenNeuronNumbers, hiddenDepths)}
    print("Models generated")

    # Train.
    policyGradient = PG.PolicyGradient(numTrajectory, maxEpisode, render)
    trainModel = lambda model: policyGradient(model, approximatePolicy, sampleTrajectory,
                                              accumulateRewards, trainPG)
    trainedModels = {key: trainModel(model) for key, model in models.items()}
    print("Finished training")

    # Evaluate.
    modelEvaluate = Evaluate(numTrajectory, approximatePolicy, sampleTrajectory, rewardFunction)
    meanEpisodeRewards = {key: modelEvaluate(model) for key, model in trainedModels.items()}
    print("Finished evaluating")
    # print(meanEpisodeRewards)

    # Visualize.
    independentVariableNames = ['NeuronTotalNumber', 'layerNumber']
    draw(meanEpisodeRewards, independentVariableNames)
    print("Finished visualizing", meanEpisodeRewards)
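
# A minimal sketch of the trajectory sampler used in the pipeline above. It
# assumes the convention visible in the MCTS scripts below, where calling the
# transition function with (None, None) produces a reset initial state; the
# class name and the policy signature are illustrative assumptions, not
# PG.SampleTrajectory itself.
class SampleTrajectorySketch:
    def __init__(self, maxTimeStep, transitionFunction, isTerminal):
        self.maxTimeStep = maxTimeStep
        self.transitionFunction = transitionFunction
        self.isTerminal = isTerminal

    def __call__(self, policy):
        state = self.transitionFunction(None, None)  # reset convention
        trajectory = []
        for _ in range(self.maxTimeStep):
            action = policy(state)
            trajectory.append((state, action))
            state = self.transitionFunction(state, action)
            if self.isTerminal(state):
                break
        return trajectory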
def main():
    """Train an offline advantage actor-critic (separate actor and critic graphs) and save checkpoints."""
    # tf.set_random_seed(123)
    # np.random.seed(123)

    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7],
                   [-10, 0], [-7, -7], [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)
    numStateSpace = 4

    numActorFC1Unit = 50
    numActorFC2Unit = 50
    numActorFC3Unit = 50
    numActorFC4Unit = 50
    numCriticFC1Unit = 100
    numCriticFC2Unit = 100
    numCriticFC3Unit = 100
    numCriticFC4Unit = 100
    learningRateActor = 1e-4
    learningRateCritic = 3e-4

    # Actor network.
    actorGraph = tf.Graph()
    with actorGraph.as_default():
        with tf.name_scope("inputs"):
            state_ = tf.placeholder(tf.float32, [None, numStateSpace], name="state_")
            # One-hot action labels; float32 so they can feed
            # softmax_cross_entropy_with_logits_v2 directly.
            actionLabel_ = tf.placeholder(tf.float32, [None, numActionSpace], name="actionLabel_")
            advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")

        with tf.name_scope("hidden"):
            initWeight = tf.random_uniform_initializer(-0.03, 0.03)
            initBias = tf.constant_initializer(0.01)
            fullyConnected1_ = tf.layers.dense(inputs=state_, units=numActorFC1Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            fullyConnected2_ = tf.layers.dense(inputs=fullyConnected1_, units=numActorFC2Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            fullyConnected3_ = tf.layers.dense(inputs=fullyConnected2_, units=numActorFC3Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            allActionActivation_ = tf.layers.dense(inputs=fullyConnected3_, units=numActionSpace, activation=None,
                                                   kernel_initializer=initWeight, bias_initializer=initBias)

        with tf.name_scope("outputs"):
            actionDistribution_ = tf.nn.softmax(allActionActivation_, name='actionDistribution_')
            # Multiplying by 1 only attaches a name to the entropy tensor.
            actionEntropy_ = tf.multiply(tfp.distributions.Categorical(probs=actionDistribution_).entropy(),
                                         1, name='actionEntropy_')
            negLogProb_ = tf.nn.softmax_cross_entropy_with_logits_v2(logits=allActionActivation_,
                                                                     labels=actionLabel_, name='negLogProb_')
            loss_ = tf.reduce_mean(tf.multiply(negLogProb_, advantages_), name='loss_')
            actorLossSummary = tf.summary.scalar("ActorLoss", loss_)

        with tf.name_scope("train"):
            trainOpt_ = tf.train.AdamOptimizer(learningRateActor, name='adamOpt_').minimize(loss_)

        actorInit = tf.global_variables_initializer()
        actorSaver = tf.train.Saver()  # needed for the checkpoint save below

    actorModel = tf.Session(graph=actorGraph)
    actorModel.run(actorInit)

    # Critic network.
    criticGraph = tf.Graph()
    with criticGraph.as_default():
        with tf.name_scope("inputs"):
            state_ = tf.placeholder(tf.float32, [None, numStateSpace], name="state_")
            valueTarget_ = tf.placeholder(tf.float32, [None, 1], name="valueTarget_")

        with tf.name_scope("hidden"):
            initWeight = tf.random_uniform_initializer(-0.03, 0.03)
            initBias = tf.constant_initializer(0.001)
            fullyConnected1_ = tf.layers.dense(inputs=state_, units=numCriticFC1Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            fullyConnected2_ = tf.layers.dense(inputs=fullyConnected1_, units=numCriticFC2Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            fullyConnected3_ = tf.layers.dense(inputs=fullyConnected2_, units=numCriticFC3Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)
            fullyConnected4_ = tf.layers.dense(inputs=fullyConnected3_, units=numCriticFC4Unit, activation=tf.nn.relu,
                                               kernel_initializer=initWeight, bias_initializer=initBias)

        with tf.name_scope("outputs"):
            value_ = tf.layers.dense(inputs=fullyConnected4_, units=1, activation=None, name='value_',
                                     kernel_initializer=initWeight, bias_initializer=initBias)
            diff_ = tf.subtract(valueTarget_, value_, name='diff_')
            loss_ = tf.reduce_mean(tf.square(diff_), name='loss_')
            criticLossSummary = tf.summary.scalar("CriticLoss", loss_)

        with tf.name_scope("train"):
            trainOpt_ = tf.train.AdamOptimizer(learningRateCritic, name='adamOpt_').minimize(loss_)

        criticInit = tf.global_variables_initializer()
        criticSaver = tf.train.Saver()  # needed for the checkpoint save below

    criticModel = tf.Session(graph=criticGraph)
    criticModel.run(criticInit)

    # Environment.
    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)

    initSheepPosition = np.array([180, 180])
    initWolfPosition = np.array([180, 180])
    initSheepVelocity = np.array([0, 0])
    initWolfVelocity = np.array([0, 0])
    initSheepPositionNoise = np.array([60, 120])
    initWolfPositionNoise = np.array([0, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPosition, initSheepPositionNoise)
    wolfPositionReset = ag.WolfPositionReset(initWolfPosition, initWolfPositionNoise)

    numOneAgentState = 2
    positionIndex = [0, 1]
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                                sheepPositionTransition, wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor,
                        circleColorList, circleSize, saveImage, saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.RewardFunctionTerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex,
                                                          aliveBouns, deathPenalty, isTerminal)
    accumulateReward = AccumulateReward(rewardDecay, rewardFunction)

    maxTimeStep = 150
    sampleTrajectory = SampleTrajectory(maxTimeStep, transitionFunction, isTerminal)

    approximatePolicy = ApproximatePolicy(actionSpace)
    trainCritic = TrainCriticMonteCarloTensorflow(accumulateReward)
    estimateAdvantage = EstimateAdvantageMonteCarlo(accumulateReward)
    trainActor = TrainActorMonteCarloTensorflow(actionSpace)

    numTrajectory = 50
    maxEpisode = 602

    # approximateValue is assumed to be imported from the same module as
    # ApproximatePolicy and the training helpers above.
    actorCritic = OfflineAdvantageActorCritic(numTrajectory, maxEpisode, render)
    trainedActorModel, trainedCriticModel = actorCritic(actorModel, criticModel, approximatePolicy,
                                                        sampleTrajectory, trainCritic, approximateValue,
                                                        estimateAdvantage, trainActor)

    savePathActor = 'data/tmpModelActor.ckpt'
    savePathCritic = 'data/tmpModelCritic.ckpt'
    with actorModel.as_default():
        actorSaver.save(trainedActorModel, savePathActor)
    with criticModel.as_default():
        criticSaver.save(trainedCriticModel, savePathCritic)
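
# A short usage sketch for the checkpoints saved above: rebuild (or reuse)
# the two graphs, then restore the variables into fresh sessions with
# tf.train.Saver.restore. The function name is illustrative; the paths are
# the ones used in main().
def restoreActorCritic(actorGraph, criticGraph):
    with actorGraph.as_default():
        actorSaver = tf.train.Saver()
    with criticGraph.as_default():
        criticSaver = tf.train.Saver()
    actorModel = tf.Session(graph=actorGraph)
    criticModel = tf.Session(graph=criticGraph)
    actorSaver.restore(actorModel, 'data/tmpModelActor.ckpt')
    criticSaver.restore(criticModel, 'data/tmpModelCritic.ckpt')
    return actorModel, criticModel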
def evaluate(cInit, cBase):
    """Run MCTS planning in the chasing environment and return the mean episode length."""
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7),
                   (-10, 0), (-7, -7), (0, -10), (7, -7)]
    numActionSpace = len(actionSpace)
    getActionPrior = GetActionPrior(actionSpace)
    numStateSpace = 4

    initSheepPosition = np.array([90, 90])
    initWolfPosition = np.array([90, 90])
    initSheepVelocity = np.array([0, 0])
    initWolfVelocity = np.array([0, 0])
    initSheepPositionNoise = np.array([40, 60])
    initWolfPositionNoise = np.array([0, 20])
    sheepPositionReset = ag.SheepPositionReset(initSheepPosition, initSheepPositionNoise)
    wolfPositionReset = ag.WolfPositionReset(initWolfPosition, initWolfPositionNoise)

    numOneAgentState = 2
    positionIndex = [0, 1]
    xBoundary = [0, 180]
    yBoundary = [0, 180]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfSpeed = 7
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust, wolfSpeed)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transition = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                        sheepPositionTransition, wolfPositionTransition)
    minDistance = 10
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = True
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor,
                        circleColorList, circleSize, saveImage, saveImageFile)

    aliveBouns = 0.05
    deathPenalty = -1
    rewardFunction = reward.RewardFunctionTerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex,
                                                          aliveBouns, deathPenalty, isTerminal)

    # Hyper-parameters.
    numSimulations = 600
    maxRunningSteps = 70

    # MCTS algorithm.
    # Select child.
    calculateScore = CalculateScore(cInit, cBase)
    selectChild = SelectChild(calculateScore)

    # Expand.
    initializeChildren = InitializeChildren(actionSpace, transition, getActionPrior)
    expand = Expand(transition, isTerminal, initializeChildren)

    # Rollout.
    rolloutPolicy = lambda state: actionSpace[np.random.choice(range(numActionSpace))]
    maxRollOutSteps = 50
    rollout = RollOut(rolloutPolicy, maxRollOutSteps, transition, rewardFunction, isTerminal)

    # backup and selectNextRoot are assumed imported with the MCTS helpers.
    mcts = MCTS(numSimulations, selectChild, expand, rollout, backup, selectNextRoot)
    runMCTS = RunMCTS(mcts, maxRunningSteps, isTerminal, render)

    rootAction = actionSpace[np.random.choice(range(numActionSpace))]
    numTestingIterations = 70
    episodeLengths = []
    for step in range(numTestingIterations):
        print(datetime.datetime.now())
        # Calling the transition with (None, None) resets the environment.
        state, action = None, None
        initState = transition(state, action)
        # optimal = math.ceil((np.sqrt(np.sum(np.power(initState[0:2] - initState[2:4], 2))) - minDistance) / 10)
        rootNode = Node(id={rootAction: initState}, num_visited=0, sum_value=0, is_expanded=True)
        episodeLength = runMCTS(rootNode)
        episodeLengths.append(episodeLength)

    meanEpisodeLength = np.mean(episodeLengths)
    print("mean episode length is", meanEpisodeLength)
    return [meanEpisodeLength]
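
# A minimal sketch of the child-selection score that CalculateScore(cInit,
# cBase) plausibly computes. The (cInit, cBase) pair matches the AlphaZero-
# style schedule, where the exploration weight grows slowly with the parent's
# visit count; the node attributes (num_visited, sum_value) are the ones used
# for Node above. Illustration only, not the project's scoring code.
class CalculateScoreSketch:
    def __init__(self, cInit, cBase):
        self.cInit = cInit
        self.cBase = cBase

    def __call__(self, parent, child):
        explorationRate = math.log((1 + parent.num_visited + self.cBase) / self.cBase) + self.cInit
        meanValue = child.sum_value / child.num_visited if child.num_visited > 0 else 0.0
        explorationBonus = explorationRate * math.sqrt(parent.num_visited) / (1 + child.num_visited)
        return meanValue + explorationBonus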
def evaluate(numTree, chasingSubtlety, numTotalSimulationTimes, cInit, cBase):
    """Multi-tree MCTS with attention-limited belief updates; returns mean episode length and escape rate."""
    print(numTree, chasingSubtlety, numTotalSimulationTimes, cInit, cBase)

    numActionSpace = 8
    actionInterval = int(360 / numActionSpace)
    actionSpace = [(np.cos(degreeInPolar), np.sin(degreeInPolar))
                   for degreeInPolar in np.arange(0, 360, actionInterval) / 180 * math.pi]
    getActionPrior = GetActionPrior(actionSpace)

    # 2D environment.
    initSheepPosition = np.array([320, 240])
    initSheepPositionNoise = np.array([0, 0])
    resetSheepState = ag.ResetAgentState(initSheepPosition, initSheepPositionNoise)
    initWolfOrDistractorPosition = np.array([320, 240])
    initWolfOrDistractorPositionNoise = np.array([125, 230])
    resetWolfOrDistractorState = ag.ResetAgentState(initWolfOrDistractorPosition, initWolfOrDistractorPositionNoise)

    numAgent = 25
    sheepId = 0
    suspectorIds = list(range(1, numAgent))
    resetWolfIdAndSubtlety = ag.ResetWolfIdAndSubtlety(suspectorIds, [chasingSubtlety])
    resetPhysicalState = ag.ResetPhysicalState(sheepId, numAgent, resetSheepState,
                                               resetWolfOrDistractorState, resetWolfIdAndSubtlety)

    numFramePerSecond = 60
    numMDPTimeStepPerSecond = 5
    numFrameWithoutActionChange = int(numFramePerSecond / numMDPTimeStepPerSecond)

    sheepActionUpdateFrequency = 1
    distanceToVisualDegreeRatio = 20
    minSheepSpeed = int(17.4 * distanceToVisualDegreeRatio / numFramePerSecond)
    maxSheepSpeed = int(23.2 * distanceToVisualDegreeRatio / numFramePerSecond)
    warmUpTimeSteps = int(10 * numMDPTimeStepPerSecond)
    sheepPolicy = ag.SheepPolicy(sheepActionUpdateFrequency, minSheepSpeed, maxSheepSpeed, warmUpTimeSteps)

    wolfActionUpdateFrequency = int(0.2 * numMDPTimeStepPerSecond)
    minWolfSpeed = int(8.7 * distanceToVisualDegreeRatio / numFramePerSecond)
    maxWolfSpeed = int(14.5 * distanceToVisualDegreeRatio / numFramePerSecond)
    wolfPolicy = ag.WolfPolicy(wolfActionUpdateFrequency, minWolfSpeed, maxWolfSpeed, warmUpTimeSteps)

    distractorActionUpdateFrequency = int(0.2 * numMDPTimeStepPerSecond)
    minDistractorSpeed = int(8.7 * distanceToVisualDegreeRatio / numFramePerSecond)
    maxDistractorSpeed = int(14.5 * distanceToVisualDegreeRatio / numFramePerSecond)
    distractorPolicy = ag.DistractorPolicy(distractorActionUpdateFrequency, minDistractorSpeed,
                                           maxDistractorSpeed, warmUpTimeSteps)

    preparePolicy = ag.PreparePolicy(sheepId, numAgent, sheepPolicy, wolfPolicy, distractorPolicy)
    updatePhysicalState = ag.UpdatePhysicalState(numAgent, preparePolicy)

    xBoundary = [0, 640]
    yBoundary = [0, 480]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)
    transiteMultiAgentMotion = ag.TransiteMultiAgentMotion(checkBoundaryAndAdjust)

    minDistance = 2.5 * distanceToVisualDegreeRatio
    isTerminal = env.IsTerminal(sheepId, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    # screen = None
    screenColor = np.array([255, 255, 255])
    sheepColor = np.array([0, 255, 0])
    wolfColor = np.array([255, 0, 0])
    circleSize = 10
    saveImage = True
    saveImageFile = 'image1'
    render = env.Render(numAgent, screen, screenColor, sheepColor, wolfColor, circleSize, saveImage, saveImageFile)

    renderOnInSimulation = False
    transiteStateWithoutActionChangeInSimulation = env.TransiteStateWithoutActionChange(
        numFrameWithoutActionChange, isTerminal, transiteMultiAgentMotion, render, renderOnInSimulation)
    renderOnInPlay = True
    transiteStateWithoutActionChangeInPlay = env.TransiteStateWithoutActionChange(
        numFrameWithoutActionChange, isTerminal, transiteMultiAgentMotion, render, renderOnInPlay)

    # Attention and belief.
    attentionLimitation = 4
    precisionPerSlot = 8.0
    precisionForUntracked = 2.5
    memoryratePerSlot = 0.7
    memoryrateForUntracked = 0.45
    attention = Attention.AttentionToPrecisionAndDecay(precisionPerSlot, precisionForUntracked,
                                                       memoryratePerSlot, memoryrateForUntracked)
    transferMultiAgentStatesToPositionDF = ba.TransferMultiAgentStatesToPositionDF(numAgent)
    possibleSubtleties = [500, 11, 3.3, 1.83, 0.92, 0.31]
    resetBeliefAndAttention = ba.ResetBeliefAndAttention(sheepId, suspectorIds, possibleSubtleties,
                                                         attentionLimitation,
                                                         transferMultiAgentStatesToPositionDF, attention)

    maxDistance = 7.5 * distanceToVisualDegreeRatio
    numStandardErrorInDistanceRange = 2
    calDistancePriorOnAttentionSlot = Attention.CalDistancePriorOnAttentionSlot(
        minDistance, maxDistance, numStandardErrorInDistanceRange)
    attentionSwitch = Attention.AttentionSwitch(attentionLimitation, calDistancePriorOnAttentionSlot)
    computePosterior = calPosterior.CalPosteriorLog(minDistance)

    attentionSwitchFrequencyInSimulation = np.inf
    beliefUpdateFrequencyInSimulation = np.inf
    updateBeliefAndAttentionInSimulation = ba.UpdateBeliefAndAttentionState(
        attention, computePosterior, attentionSwitch, transferMultiAgentStatesToPositionDF,
        attentionSwitchFrequencyInSimulation, beliefUpdateFrequencyInSimulation)

    attentionSwitchFrequencyInPlay = int(0.6 * numMDPTimeStepPerSecond)
    beliefUpdateFrequencyInPlay = int(0.2 * numMDPTimeStepPerSecond)
    updateBeliefAndAttentionInPlay = ba.UpdateBeliefAndAttentionState(
        attention, computePosterior, attentionSwitch, transferMultiAgentStatesToPositionDF,
        attentionSwitchFrequencyInPlay, beliefUpdateFrequencyInPlay)

    updatePhysicalStateByBeliefFrequencyInSimulationRoot = int(0.2 * numMDPTimeStepPerSecond)
    updatePhysicalStateByBeliefInSimulationRoot = ba.UpdatePhysicalStateImagedByBelief(
        updatePhysicalStateByBeliefFrequencyInSimulationRoot)
    updatePhysicalStateByBeliefFrequencyInSimulation = np.inf
    updatePhysicalStateByBeliefInSimulation = ba.UpdatePhysicalStateImagedByBelief(
        updatePhysicalStateByBeliefFrequencyInSimulation)
    updatePhysicalStateByBeliefFrequencyInPlay = np.inf
    updatePhysicalStateByBeliefInPlay = ba.UpdatePhysicalStateImagedByBelief(
        updatePhysicalStateByBeliefFrequencyInPlay)

    transitionFunctionInSimulation = env.TransitionFunction(
        resetPhysicalState, resetBeliefAndAttention, updatePhysicalState,
        transiteStateWithoutActionChangeInSimulation, updateBeliefAndAttentionInSimulation,
        updatePhysicalStateByBeliefInSimulation)
    transitionFunctionInPlay = env.TransitionFunction(
        resetPhysicalState, resetBeliefAndAttention, updatePhysicalState,
        transiteStateWithoutActionChangeInPlay, updateBeliefAndAttentionInPlay,
        updatePhysicalStateByBeliefInPlay)

    maxRollOutSteps = 5
    aliveBouns = 1 / maxRollOutSteps
    deathPenalty = -1
    rewardFunction = reward.RewardFunctionTerminalPenalty(sheepId, aliveBouns, deathPenalty, isTerminal)

    # MCTS algorithm.
    # Select child.
    calculateScore = CalculateScore(cInit, cBase)
    selectChild = SelectChild(calculateScore)

    # Expand.
    initializeChildren = InitializeChildren(actionSpace, transitionFunctionInSimulation, getActionPrior)
    expand = Expand(isTerminal, initializeChildren)

    # Rollout.
    rolloutPolicy = lambda state: actionSpace[np.random.choice(range(numActionSpace))]
    rollout = RollOut(rolloutPolicy, maxRollOutSteps, transitionFunctionInSimulation, rewardFunction, isTerminal)

    numActionPlaned = 1
    selectAction = SelectAction(numActionPlaned, actionSpace)

    # Each of the numTree root trees gets an equal share of the simulation budget.
    numSimulations = int(numTotalSimulationTimes / numTree)

    sheepColorInMcts = np.array([0, 255, 0])
    wolfColorInMcts = np.array([255, 0, 0])
    distractorColorInMcts = np.array([0, 0, 0])
    mctsRender = env.MctsRender(numAgent, screen, xBoundary[1], yBoundary[1], screenColor,
                                sheepColorInMcts, wolfColorInMcts, distractorColorInMcts,
                                circleSize, saveImage, saveImageFile)
    mctsRenderOn = True
    mcts = MCTS(numSimulations, selectChild, expand, rollout, backup, selectAction, mctsRender, mctsRenderOn)

    maxRunningSteps = int(25 * numMDPTimeStepPerSecond)
    makeDiffSimulationRoot = MakeDiffSimulationRoot(isTerminal, updatePhysicalStateByBeliefInSimulationRoot)
    runMCTS = RunMCTS(maxRunningSteps, numTree, numActionPlaned, transitionFunctionInPlay,
                      isTerminal, makeDiffSimulationRoot, render)

    rootAction = actionSpace[np.random.choice(range(numActionSpace))]
    numTestingIterations = 1
    episodeLengths = []
    escape = 0
    step = 1
    while step <= numTestingIterations:
        print(datetime.datetime.now())
        episodeLength = runMCTS(mcts)
        # Only count episodes that last at least one second of MDP time.
        if episodeLength >= 1 * numMDPTimeStepPerSecond:
            step = step + 1
            episodeLengths.append(episodeLength)
            if episodeLength >= maxRunningSteps - 10:
                escape = escape + 1

    meanEpisodeLength = np.mean(episodeLengths)
    print("mean episode length is", meanEpisodeLength, escape / numTestingIterations)
    return [meanEpisodeLength, escape / numTestingIterations]
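
# A minimal sketch of the backup step passed to MCTS above: after expansion
# and rollout, the rollout value is added to every node on the selection path
# and visit counts are incremented, using the same node attributes
# (num_visited, sum_value) as the scripts above. Illustration only, not the
# project's backup.
def backupSketch(value, nodePath):
    for node in nodePath:
        node.num_visited += 1
        node.sum_value += value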
def main():
    """Grid-search offline advantage actor-critic training over network width and depth."""
    # tf.set_random_seed(123)
    # np.random.seed(123)

    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7],
                   [-10, 0], [-7, -7], [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)
    numStateSpace = 4

    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)

    initSheepPosition = np.array([180, 180])
    initWolfPosition = np.array([180, 180])
    initSheepVelocity = np.array([0, 0])
    initWolfVelocity = np.array([0, 0])
    initSheepPositionNoise = np.array([120, 120])
    initWolfPositionNoise = np.array([60, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPosition, initSheepPositionNoise, checkBoundaryAndAdjust)
    wolfPositionReset = ag.WolfPositionReset(initWolfPosition, initWolfPositionNoise, checkBoundaryAndAdjust)

    numOneAgentState = 2
    positionIndex = [0, 1]
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                                sheepPositionTransition, wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor,
                        circleColorList, circleSize, saveImage, saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.RewardFunctionTerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex,
                                                          aliveBouns, deathPenalty, isTerminal)
    accumulateReward = A2CMC.AccumulateReward(rewardDecay, rewardFunction)

    maxTimeStep = 150
    sampleTrajectory = A2CMC.SampleTrajectory(maxTimeStep, transitionFunction, isTerminal)

    approximatePolicy = A2CMC.ApproximatePolicy(actionSpace)
    approximateValue = A2CMC.approximateValue
    trainCritic = A2CMC.TrainCriticMonteCarloTensorflow(accumulateReward)
    estimateAdvantage = A2CMC.EstimateAdvantageMonteCarlo(accumulateReward)
    trainActor = A2CMC.TrainActorMonteCarloTensorflow(actionSpace)

    numTrajectory = 5
    maxEpisode = 1

    # Generate models.
    learningRateActor = 1e-4
    learningRateCritic = 3e-4
    hiddenNeuronNumbers = [128, 256, 512, 1024]
    hiddenDepths = [2, 4, 8]
    generateModel = GenerateActorCriticModel(numStateSpace, numActionSpace, learningRateActor, learningRateCritic)
    # Key (n, d): d hidden layers of round(n / d) units; values are (actor, critic) pairs.
    modelDict = {(n, d): generateModel(d, round(n / d))
                 for n, d in it.product(hiddenNeuronNumbers, hiddenDepths)}
    print("Generated graphs")

    # Train.
    actorCritic = A2CMC.OfflineAdvantageActorCritic(numTrajectory, maxEpisode, render)
    modelTrain = lambda actorModel, criticModel: actorCritic(actorModel, criticModel, approximatePolicy,
                                                             sampleTrajectory, trainCritic, approximateValue,
                                                             estimateAdvantage, trainActor)
    trainedModelDict = {key: modelTrain(model[0], model[1]) for key, model in modelDict.items()}
    print("Finished training")

    # Evaluate.
    modelEvaluate = Evaluate(numTrajectory, approximatePolicy, sampleTrajectory, rewardFunction)
    meanEpisodeRewards = {key: modelEvaluate(model[0], model[1]) for key, model in trainedModelDict.items()}
    print("Finished evaluating")

    # Visualize.
    independentVariableNames = ['NeuronTotalNumber', 'layerNumber']
    draw(meanEpisodeRewards, independentVariableNames)
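
# A minimal sketch of the Evaluate helper shared by these grid-search
# scripts: roll out numTrajectory episodes with the trained actor and average
# the summed per-step rewards. The (actorModel, criticModel) call signature
# mirrors the usage above, but the approximatePolicy signature inside is an
# assumption for illustration, not the project's Evaluate.
class EvaluateSketch:
    def __init__(self, numTrajectory, approximatePolicy, sampleTrajectory, rewardFunction):
        self.numTrajectory = numTrajectory
        self.approximatePolicy = approximatePolicy
        self.sampleTrajectory = sampleTrajectory
        self.rewardFunction = rewardFunction

    def __call__(self, actorModel, criticModel):
        policy = lambda state: self.approximatePolicy(actorModel, state)  # assumed signature
        episodeRewards = []
        for _ in range(self.numTrajectory):
            trajectory = self.sampleTrajectory(policy)
            episodeRewards.append(sum(self.rewardFunction(state, action) for state, action in trajectory))
        return np.mean(episodeRewards)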