Example #1
File: Sim.py Project: lono175/Mario
def TestSim(obs):
    MaxY = 16
    MaxX = 22
    state = WorldState(obs)
    print "mario loc ", state.mario.x, " ", state.mario.y

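    # Build one learner for the immediate reward and one for the world
    # dynamics over the shared feature variables; the first common variable
    # is dropped before the dynamics learner is constructed.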
    commonVar = getCommonVar()
    classVarList = getClassVar()
    rewardVar = orange.FloatVariable("reward")
    RewardLearner = Learner(commonVar, [rewardVar], 3000)
    commonVar.pop(0)
    DynamicLearner = Learner(commonVar, classVarList, 3000)

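    # Seed both learners with one training example derived from the current state.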
    lastActionId = 9
    modelFea = getModelFeature(state, [2.0, 1.0, 0.0, 0.0])
    rewardFea = getTrainFeature(state, [0.0], lastActionId)  # don't learn the pseudo reward

    DynamicLearner.add([modelFea])
    RewardLearner.add([rewardFea])

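    # Reuse the same dynamics learner for all 12 actions, plan a path from
    # the current state, then expand it into a sequence of predicted world states.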
    dynaLearner = [DynamicLearner for action in range(12)]
    path = Optimize(state, dynaLearner, RewardLearner, 100, [], ActionRange)
    newState = ExpandPath(path, MakeSimState(state, 10), dynaLearner, RewardLearner)
    print type(newState)
    print "hello"
    for world in newState.worldList:
        print "loc: ", world.mario.x
Example #2
    def agent_step(self, reward, obs):
        #self.obsList.append(obs)
        #if reward < -0.01 + epsilon and reward > -0.01 - epsilon:
            #reward = -1

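        # Parse the new observation and shape the external reward with
        # Mario's horizontal progress since the last step.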
        state = WorldState(obs)

        fea = getSarsaFeature(state, self.lastAction)
        lastMario = self.lastState.mario
        mario = state.mario #for internal reward system
        dx = mario.x - lastMario.x

        reward = reward + dx
        modelReward = 0
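        # A pit is penalized only in the internal model reward; the external
        # reward is left unchanged.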
        if isMarioInPit(state):
            print "in pit !!!!!!!"
            #reward = reward + InPitPenalty #no pit penalty for HORDQ
            modelReward = InPitPenalty
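        # Until the model is ready, act with the plain SARSA agent; afterwards
        # use epsilon-greedy exploration around the planner's choice, passing
        # the planned action to the agent with a temporarily raised pseudo reward.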
        if not self.isModelReady():
            #fea = getSarsaFeature(obs)
            action = self.agent.step(reward, fea, NoTask)
        else:
            #epsilon-greedy policy
            if random.random() < self.epsilon:
                #select randomly
                action = random.choice(self.actionList)
                print "random!!"
            else:
                possibleAction = self.agent.getPossibleAction(fea)
                #if fea[0] == (): #if no monster is around, pass control to the planner
                    #possibleAction = self.actionList
                action = self.planning(state, possibleAction)

            print "planning", action
            self.agent.pseudoReward = 10000
            action = self.agent.step(reward, fea, action)
            self.agent.pseudoReward = self.initPseudoReward
        #state.dump()
        print "step loc:",  self.stepNum, " ", mario.x , " ", mario.y, " ", mario.sx, " ", mario.sy
        #state.path = []
        #state.reward = 0

        #nextState, isValid =  ExpandPath([0], state, self.DynamicLearner, self.RewardLearner)

        #nextState.dump()
        #print "pred loc:", nextState.mario.x , " ", nextState.mario.y, " ", nextState.mario.sx, " ", nextState.mario.sy
        #print "backoff reward: ", nextState.reward

        #nextState, isValid =  ExpandPath([action], state, self.DynamicLearner, self.RewardLearner)
        #nextState.dump()
        #print "pred loc:", nextState.mario.x , " ", nextState.mario.y, " ", nextState.mario.sx, " ", nextState.mario.sy
        #print "pred rewar:", action, " ", nextState.reward



        lastActionId = self.lastAction

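        # Deviations from a constant-velocity prediction (deltaX/deltaY) and
        # the change in velocity (aX/aY) are the class values the dynamics
        # model is trained to predict for the last action.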
        deltaX = mario.x - (lastMario.x + lastMario.sx)
        deltaY = mario.y - (lastMario.y + lastMario.sy)
        aX = mario.sx - lastMario.sx 
        aY = mario.sy - lastMario.sy 
        
        classVar = [round(aX, Precision), round(aY, Precision), round(deltaX, Precision), round(deltaY, Precision)]
        rewardClassVar = [round(modelReward, 0)]
        modelFea = getModelFeature(self.lastState, classVar)
        #rewardFea = getTrainFeature(self.lastState, rewardClassVar, lastActionId) #don't learn the pseudo reward

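        # Keep a new training example only when the current dynamics model
        # mispredicts the observed transition; before the model is ready,
        # keep every example (unless this is a plain SARSA agent).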
        if self.isModelReady(): #TODO: too dirty

            predictModelClass = self.DynamicLearner[lastActionId].getClass(modelFea)
            predictModelClass = [round(v, 1) for v in predictModelClass]
            roundClassVar = [round(v, 1) for v in classVar]
            print "feature: ", lastActionId, " ", modelFea
            print "predict: ", predictModelClass
            if roundClassVar != predictModelClass:
                self.feaList[lastActionId].append(modelFea)
            else:
                print "pass model-------------"
        else:
            if self.AgentType() != AgentType.SarsaAgent:
                self.feaList[lastActionId].append(modelFea)


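        # Update the separate reward agent towards the model reward for this transition.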
        rewardFea = getRewardFeature(state, self.lastAction)
        print "before pre reward: ", self.rewardAgent.getQ(rewardFea, action)
        self.rewardAgent.step(rewardFea, modelReward, action)
        print "pre reward: ", self.rewardAgent.getQ(rewardFea, action)
        print "reward: ", modelReward
        #if self.isModelReady():
            #predictRewardClass = self.RewardLearner.getClass(rewardFea)
            #predictRewardClass = [round(v, 0) for v in predictRewardClass]
            #print "reward: ", modelReward
            #print "pre reward: ", predictRewardClass
            #if not rewardClassVar == predictRewardClass:
                #self.rewardFeaList.append(rewardFea)
            #else:
                #print "pass reward-------------"
        #else:
            #if not self.AgentType() == SarsaAgent:
                #self.rewardFeaList.append(rewardFea)

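        # Bookkeeping: roll the state and action history forward and record
        # Mario's progress for this episode.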
        self.lastState = state
        self.lastLastAction = self.lastAction
        self.lastAction = action

        self.stepNum += 1


        self.distList[-1] = (self.totalStep + self.stepNum, self.lastState.mario.x, self.episodeNum, 0)

        return makeAction(action)