def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    helper_vector = util.Counter()  # copy of the values, used for batch updating

    for i in range(self.iterations):
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            actions = mdp.getPossibleActions(state)
            if actions:
                # Seed with the Q-value of the first action, then take the max
                # over all actions below.
                helper_vector[state] = sum(
                    prob * (mdp.getReward(state, actions[0], nextState)
                            + self.discount * self.values[nextState])
                    for nextState, prob in
                    mdp.getTransitionStatesAndProbs(state, actions[0]))
            for action in actions:
                helper_vector[state] = max(
                    helper_vector[state],
                    sum(prob * (mdp.getReward(state, action, nextState)
                                + self.discount * self.values[nextState])
                        for nextState, prob in
                        mdp.getTransitionStatesAndProbs(state, action)))
        for state in helper_vector:
            self.values[state] = helper_vector[state]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    self.optimalActionInState = collections.defaultdict(None)
    for k in range(iterations):
        lastValues = self.values.copy()
        for state in mdp.getStates():
            if self.mdp.isTerminal(state):
                continue
            maxValue = float("-inf") if mdp.getPossibleActions(state) else 0
            for action in mdp.getPossibleActions(state):
                theSum = 0
                for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                    R = self.mdp.getReward(state, action, nextState)
                    theSum += prob * (R + self.discount * lastValues[nextState])
                maxValue = max(maxValue, theSum)
            self.values[state] = maxValue
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        currentValues = self.values.copy()
        for s in mdp.getStates():
            if not self.mdp.isTerminal(s):
                # Back up the max Q-value over all legal actions.
                currentValues[s] = max(self.getQValue(s, a)
                                       for a in mdp.getPossibleActions(s))
        self.values = currentValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    self.depth = 1
    self.qTable = {}
    self.vTable = {}
    for state in mdp.getStates():
        self.vTable[state] = 0
        self.qTable[state] = {}
        for action in mdp.getPossibleActions(state):
            self.qTable[state][action] = 0

    while self.depth < self.iterations + 1:
        self.tempTable = {}
        for state in mdp.getStates():
            self.stateValue = 0
            if not mdp.isTerminal(state):
                self.stateValue = -9999
                for action in mdp.getPossibleActions(state):
                    self.Qtotal = 0
                    for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                        self.reward = mdp.getReward(state, action, nextState)
                        self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                    self.qTable[state][action] = self.Qtotal
                    self.stateValue = max(self.stateValue, self.qTable[state][action])
            else:
                self.tempTable[state] = 0
            self.tempTable[state] = self.stateValue
        self.vTable = self.tempTable
        self.depth += 1

    # One final sweep to refresh the Q-table against the converged values.
    for state in mdp.getStates():
        self.stateValue = -9999
        for action in mdp.getPossibleActions(state):
            self.Qtotal = 0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                self.reward = mdp.getReward(state, action, nextState)
                self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
            self.qTable[state][action] = self.Qtotal
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    i = 0
    while i < iterations:
        nextValues = util.Counter()
        for state in mdp.getStates():
            stateValues = []
            for action in mdp.getPossibleActions(state):
                sumValue = 0
                for nextState, probability in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    sumValue = sumValue + (probability * (reward + (discount * self.values[nextState])))
                stateValues.append(sumValue)
            if len(mdp.getPossibleActions(state)) == 0:
                nextValues[state] = 0
            else:
                nextValues[state] = max(stateValues)
        i += 1
        self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.delta = 0

    while self.iterations > 0:
        batchValues = util.Counter()
        for state in mdp.getStates():
            maxM = -10000
            if mdp.isTerminal(state):
                continue
            for action in mdp.getPossibleActions(state):
                statesProbs = mdp.getTransitionStatesAndProbs(state, action)
                sumU = 0
                Rs = 0
                for nextState, prob in statesProbs:
                    sumU = sumU + self.values[nextState] * prob
                    Rs = Rs + mdp.getReward(state, action, nextState) * prob
                v = Rs + sumU * discount
                if v > maxM:
                    maxM = v
            batchValues[state] = maxM
        self.values = batchValues
        self.iterations = self.iterations - 1

    # Precompute the greedy policy from the converged values.
    self.policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            self.policy[state] = None
            continue
        QValues = []
        for action in mdp.getPossibleActions(state):
            QValues.append(self.getQValue(state, action))
        self.policy[state] = mdp.getPossibleActions(state)[QValues.index(max(QValues))]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    currentIterationCounter = 1
    # Seed each state's value with the reward for stopping in place.
    for state in mdp.getStates():
        self.values[state] = mdp.getReward(state, 'Stop', state)
    while currentIterationCounter != self.iterations:
        newValues = util.Counter()
        for state in mdp.getStates():
            tempValues = util.Counter()
            for action in mdp.getPossibleActions(state):
                for newState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    tempValues[action] += prob * (mdp.getReward(state, action, newState)
                                                  + self.discount * self.values[newState])
            newValues[state] = tempValues[tempValues.argMax()]
        currentIterationCounter += 1
        for state in mdp.getStates():
            self.values[state] = newValues[state]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    mdpStates = mdp.getStates()
    for iteration in xrange(iterations):
        newValues = util.Counter()
        for state in mdpStates:
            if self.mdp.isTerminal(state):
                continue
            actionValues = -sys.maxint - 1
            for action in mdp.getPossibleActions(state):
                total = 0
                for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    total += prob * (mdp.getReward(state, action, transitionState)
                                     + discount * self.values[transitionState])
                if total > actionValues:
                    actionValues = total
            newValues[state] = actionValues
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # While iterations remain: for each state, for each action in that state,
    # compute Q(state, action) and store the largest value for the state.
    for i in range(self.iterations):
        # Copy, so each sweep reads the previous iteration's values (batch update).
        newValues = self.values.copy()
        for state in mdp.getStates():
            v = [float("-inf")]
            if not mdp.isTerminal(state):
                for action in mdp.getPossibleActions(state):
                    v += [self.computeQValueFromValues(state, action)]
                newValues[state] = max(v)
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()

    for i in range(iterations):  # run the algorithm for the indicated number of iterations
        y = self.values.copy()  # V_{k-1}
        for state in mdp.getStates():
            actions = util.Counter()
            if not mdp.isTerminal(state):
                for possibleAction in mdp.getPossibleActions(state):
                    for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleAction):
                        value_iteration = prob * (mdp.getReward(state, possibleAction, transitionState)
                                                  + (discount * y[transitionState]))
                        actions[possibleAction] += value_iteration
                self.values[state] = actions[actions.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    allStates = self.mdp.getStates()

    # Write value iteration code here
    for i in range(iterations):
        interState = util.Counter()
        for state in allStates:
            best = -9999999
            actions = mdp.getPossibleActions(state)
            for action in actions:
                transitions = self.mdp.getTransitionStatesAndProbs(state, action)
                sumTransitions = 0
                for nextState, prob in transitions:
                    reward = self.mdp.getReward(state, action, nextState)
                    sumTransitions += prob * (reward + discount * self.values[nextState])
                best = max(best, sumTransitions)
            if best != -9999999:
                interState[state] = best
        for state in allStates:
            self.values[state] = interState[state]
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state
    according to the values currently stored in self.values.

    You may break ties any way you see fit. Note that if
    there are no legal actions, which is the case at the
    terminal state, you should return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    possibleActions = mdp.getPossibleActions(state)
    maxActionValue = float('-inf')
    maxAction = None
    if not possibleActions or mdp.isTerminal(state):
        return None
    for action in possibleActions:
        actionSum = self.getQValue(state, action)
        # Track the maximizing action.
        if maxActionValue < actionSum:
            maxAction = action
            maxActionValue = actionSum
    return maxAction
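# Many of the constructors in this section delegate to self.getQValue /
# self.computeQValueFromValues without showing it. Below is a minimal sketch of
# that helper, assuming the same util.Counter and MDP interface used throughout
# (a hypothetical reconstruction for reference, not any one author's version;
# in the project scaffold, getQValue typically just delegates to this method):
def computeQValueFromValues(self, state, action):
    """Return the Q-value of (state, action) computed from self.values."""
    qValue = 0
    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
        reward = self.mdp.getReward(state, action, nextState)
        qValue += prob * (reward + self.discount * self.values[nextState])
    return qValue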
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.qvalues = util.Counter()

    states = mdp.getStates()
    for i in range(self.iterations):
        valuesCopy = self.values.copy()
        for state in states:
            actions = mdp.getPossibleActions(state)
            q = []
            for action in actions:
                q.append(self.getQValue(state, action))
            if len(q) == 0:
                valuesCopy[state] = 0
            else:
                valuesCopy[state] = max(q)
        self.values = valuesCopy
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    "*** YOUR CODE HERE ***"
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # value of each state; a Counter is a dict with default 0

    # run for the desired number of iterations
    for i in xrange(iterations):
        new_values = self.values.copy()
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                # The commented code below works as well:
                # curr_best = float("-inf")
                # for a in mdp.getPossibleActions(s):
                #     temp_value = sum([p * (mdp.getReward(s, a, s2) + discount * prev[s2])
                #                       for s2, p in mdp.getTransitionStatesAndProbs(s, a)])
                #     if temp_value > curr_best:
                #         curr_best = temp_value
                # self.values[s] = curr_best
                new_values[s] = max([self.getQValue(s, a) for a in mdp.getPossibleActions(s)])
        self.values = new_values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        previous = self.values.copy()
        for state in mdp.getStates():
            possibleActions = mdp.getPossibleActions(state)
            if len(possibleActions) == 0:
                continue
            results = []
            for action in possibleActions:
                total = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    total += prob * previous[nextState]
                results.append(total)
            # This MDP variant defines rewards on states alone: R(s).
            self.values[state] = mdp.getReward(state) + (discount * max(results))
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        valuesNew = util.Counter()
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                vals = util.Counter()
                for possact in mdp.getPossibleActions(state):
                    vals[possact] = self.computeQValueFromValues(state, possact)
                valuesNew[state] = max(vals.values())
        for st2 in valuesNew:
            self.values[st2] = valuesNew[st2]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    while self.iterations > 0:
        prev_values = self.values.copy()
        for state in mdp.getStates():
            actions = mdp.getPossibleActions(state)
            if not actions:
                continue
            self.values[state] = max(
                [sum([prob * (mdp.getReward(state, act, state1) + discount * prev_values[state1])
                      for state1, prob in mdp.getTransitionStatesAndProbs(state, act)])
                 for act in actions])
        self.iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.ValuesDup = util.Counter()

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    iterations = self.iterations
    while iterations > 0:
        for astate in mdp.getStates():
            if not mdp.isTerminal(astate):
                QVallist = []
                for action in mdp.getPossibleActions(astate):
                    QVallist += [self.computeQValueFromValues(astate, action)]
                self.values[astate] = max(QVallist)
        for state, value in self.values.items():
            self.ValuesDup[state] = value
        iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    states = mdp.getStates()
    for k in range(iterations):
        newValues = {}
        for state in states:
            actions = mdp.getPossibleActions(state)
            v = util.Counter()
            for action in actions:
                v[action] = self.computeQValueFromValues(state, action)
            newValues[state] = v[v.argMax()]
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for times in range(iterations):
        V = self.values.copy()
        for state in mdp.getStates():
            action_values = util.Counter()
            for action in mdp.getPossibleActions(state):
                for trans_state, prob in mdp.getTransitionStatesAndProbs(state, action):
                    action_values[action] += prob * (mdp.getReward(state, action, trans_state)
                                                     + discount * V[trans_state])
            self.values[state] = action_values[action_values.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(0, iterations):
        # Iterate once through all states and actions, saving the Q-values.
        for state in mdp.getStates():
            for action in mdp.getPossibleActions(state):
                # compute the Q-value for each action
                qValue = self.getQValue(state, action)
                self.values[(state, action)] = qValue
        # After all Q-values are computed, iterate again through the states and
        # save the value under the optimal policy. These values become V* for
        # the next iteration.
        for state in mdp.getStates():
            action = self.getAction(state)
            self.values[state] = self.values[(state, action)]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    self.vks = util.Counter()
    for i in range(0, iterations):
        self.vks = self.values.copy()
        st = mdp.getStates()
        for s in st:
            a = mdp.getPossibleActions(s)
            qvals = util.Counter()
            for action in a:
                qvals[action] = 0
                stp = self.mdp.getTransitionStatesAndProbs(s, action)
                for ss, prob in stp:
                    qvals[action] = qvals[action] + prob * (self.mdp.getReward(s, action, ss)
                                                            + self.discount * (self.vks[ss]))
            self.values[s] = qvals[qvals.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    # Init : Not required
    # Value iteration
    for i in range(iterations):
        old_values = self.values.copy()
        for state in mdp.getStates():
            value_state_action = []
            for action in mdp.getPossibleActions(state):
                val = 0
                transition = mdp.getTransitionStatesAndProbs(state, action)
                for sstate, prob_s_a_ss in transition:
                    val += prob_s_a_ss * (mdp.getReward(state, action, sstate)
                                          + discount * old_values[sstate])
                value_state_action.append(val)
            if value_state_action:
                self.values[state] = max(value_state_action)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for time in range(iterations):
        values = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                values[state] = 0
            else:
                maxValue = float("-inf")
                for action in mdp.getPossibleActions(state):
                    maxValue = max(maxValue, self.getQValue(state, action))
                values[state] = maxValue
        self.values = values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount * self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    # OUR CODE HERE
    for times in range(0, iterations):
        # values from the previous iteration, so we don't update over them while iterating
        prevVals = self.values.copy()
        # iterate through all states
        for state in mdp.getStates():
            # stores the action values for this iteration
            value = util.Counter()
            for action in mdp.getPossibleActions(state):
                for transitionState, probability in mdp.getTransitionStatesAndProbs(state, action):
                    # expected value: probability * (immediate reward + discounted
                    # value of the successor state)
                    value[action] += probability * (mdp.getReward(state, action, transitionState)
                                                    + discount * prevVals[transitionState])
            # update to the best action value from this iteration;
            # argMax() returns the key with the largest value
            self.values[state] = value[value.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Author - Shandheap Shanmuganathan
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default values as 0
    self.count = 1
    while self.count <= iterations:
        for state in mdp.getStates():
            possibleActions = mdp.getPossibleActions(state)
            if len(possibleActions) == 0:
                continue
            QValues = {}
            for action in possibleActions:
                if action == "exit":
                    finalScore = self.mdp.getReward(state, action, 'TERMINAL_STATE')
                    self.values[state, self.count] = finalScore
                    continue
                else:
                    QValues[action] = self.getQValue(state, action)
            maxAction = None
            maxQ = -sys.maxint - 1
            for key, value in QValues.iteritems():
                if value > maxQ:
                    maxAction = key
                    maxQ = value
            if maxQ != -sys.maxint - 1:
                self.values[state, self.count] = maxQ
        self.count += 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    while self.iterations > 0:
        prevValues = self.values.copy()
        for state in self.mdp.getStates():
            actionValues = {}
            for action in mdp.getPossibleActions(state):
                actionValues[action] = 0
                for (nextState, prob) in self.mdp.getTransitionStatesAndProbs(state, action):
                    actionValues[action] += prob * (mdp.getReward(state, action, nextState)
                                                    + self.discount * prevValues[nextState])
            try:
                self.values[state] = max(actionValues.values())
            except ValueError:
                # No legal actions (terminal state): the value stays 0.
                self.values[state] = 0
        self.iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for n in range(iterations):
        V = self.values.copy()
        for s in mdp.getStates():
            action_values = []
            for a in mdp.getPossibleActions(s):
                action_value = 0
                for s_, P in mdp.getTransitionStatesAndProbs(s, a):
                    action_value += P * (mdp.getReward(s, a, s_) + discount * V[s_])
                action_values.append(action_value)
            self.values[s] = max(action_values or [0])
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(iterations):
        lastValues = copy.deepcopy(self.values)
        for s in mdp.getStates():
            aCounter = util.Counter()
            for a in mdp.getPossibleActions(s):
                for s2 in mdp.getStates():
                    aCounter[a] += self.T(s, a, s2) * (mdp.getReward(s, a, s2)
                                                       + discount * lastValues[s2])
            self.values[s] = aCounter[aCounter.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    MDPStates = self.mdp.getStates()
    for i in range(0, self.iterations):
        TemporaryValue = util.Counter()
        for State in MDPStates:
            # Is the state a terminal state?
            if self.mdp.isTerminal(State):
                TemporaryValue[State] = 0
            else:
                MaxPossibleValue = float("-inf")
                for Action in self.mdp.getPossibleActions(State):
                    # Score is initially 0
                    Score = 0
                    for NextState, Probability in self.mdp.getTransitionStatesAndProbs(State, Action):
                        # Bellman equation
                        Score += Probability * (self.mdp.getReward(State, Action, NextState)
                                                + (self.discount * self.values[NextState]))
                    # Keep whichever is greater: the previous best or the current score.
                    MaxPossibleValue = max(Score, MaxPossibleValue)
                # Update the temporary value of the state (for the next iteration).
                TemporaryValue[State] = MaxPossibleValue
        self.values = TemporaryValue
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    # history[state][i] holds the value of state after i iterations.
    self.history = {}
    for state in mdp.getStates():
        self.history[state] = []
    for i in range(iterations + 1):
        for state in mdp.getStates():
            if i == 0:
                self.history[state].append(0)
                continue
            if mdp.isTerminal(state):
                self.history[state].append(0)
                continue
            actions = mdp.getPossibleActions(state)
            if 'exit' in actions:
                self.history[state].append(
                    mdp.getReward(state, 'exit',
                                  mdp.getTransitionStatesAndProbs(state, 'exit')[0][0]))
                continue
            best = -99999999
            for action in actions:
                statesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
                tempMax = 0
                for (s, p) in statesAndProbs:
                    tempMax += p * (mdp.getReward(state, action, s)
                                    + self.discount * self.history[s][i - 1])
                if tempMax > best:
                    best = tempMax
            self.history[state].append(best)
    # The final iteration's entries are the values.
    for key, value in self.history.iteritems():
        self.values[key] = value[iterations]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    print("using discount {}".format(discount))
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.policies = util.Counter()  # A Counter is a dict with default 0
    delta = 0.01

    # Policy Iteration.
    # Exit either when the number of iterations is reached,
    # OR upon convergence (L2 distance < delta).
    # Print the number of iterations to convergence.
    # To make the comparison FAIR, one iteration is a single sweep over states.
    # Compute the number of steps until policy convergence, but do not stop
    # the algorithm until the values converge.

    # Initialize values and an arbitrary starting policy.
    for s in mdp.getStates():
        self.values[s] = 0
        if mdp.isTerminal(s):
            continue
        self.policies[s] = mdp.getPossibleActions(s)[0]

    state_iters = 0   # iterations over the state space until policy convergence
    policy_iters = 0  # iterations of the algorithm until policy convergence
    algo_iters = 0

    def L2_norm(v1, v2):
        dist = 0
        for k in v1.keys():
            dist += (v1[k] - v2[k]) ** 2
        return dist ** (1 / 2)

    policy_stable = False
    values_converged = False
    while not values_converged and algo_iters != iterations:
        # Policy Evaluation
        dist = delta
        while dist >= delta:
            old_values = self.values.copy()
            for s in mdp.getStates():
                # Skip terminal states
                if mdp.isTerminal(s):
                    continue
                new_v = 0
                for s_n, p in mdp.getTransitionStatesAndProbs(s, self.policies[s]):
                    new_v += p * (mdp.getReward(s, self.policies[s], s_n)
                                  + discount * self.values[s_n])
                self.values[s] = new_v
            # Calculate the new distance
            dist = L2_norm(self.values, old_values)
            if not policy_stable:
                state_iters += 1
        values_converged = True

        # Policy Improvement
        if not policy_stable:
            policy_iters += 1
            state_iters += 1
        policy_stable = True
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                continue
            old_action = self.policies[s]
            p_list = list()
            possible_actions = mdp.getPossibleActions(s)
            for a in possible_actions:
                v_sum = 0
                for s_n, p in mdp.getTransitionStatesAndProbs(s, a):
                    v_sum += p * (mdp.getReward(s, a, s_n) + discount * self.values[s_n])
                p_list.append(v_sum)
            # Assign the maximizing action to the current state.
            self.policies[s] = possible_actions[np.argmax(p_list)]
            if old_action != self.policies[s]:
                policy_stable = False
                values_converged = False
        algo_iters += 1

    print(f"Policy Iteration: {state_iters} iterations over the state space")
    print(f"Policy Iteration: {policy_iters} iterations until policy convergence")
print("using discount {}".format(discount)) self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 delta = 0.01 <<<<<<< HEAD for iteration in range(self.iterations): temp_values = util.Counter() l2_distance = 0 for state in self.mdp.getStates(): value = -np.inf if mdp.isTerminal(state): temp_values[state] = 0 continue for action in mdp.getPossibleActions(state): list = mdp.getTransitionStatesAndProbs(state, action) tmp_value = 0 for pair in list: tmp_value += pair[1] * (mdp.getReward(state, action, pair[0]) + self.discount * self.values[pair[0]]) value = max(value, tmp_value) temp_values[state] = value l2_distance = max(l2_distance, np.linalg.norm(value - self.values[state])) if l2_distance < delta: print(iteration) break self.values = temp_values ======= >>>>>>> d00d5057d6ac8b04a3f737da09dbfae34a50aec3
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    max_err = 0.001
    iteration_number = 1
    U = util.Counter()
    while iteration_number <= self.iterations:
        U = self.values.copy()
        delta = 0
        for s in mdp.getStates():
            if self.mdp.isTerminal(s):
                self.values[s] = 0
            else:
                max_a = mdp.getPossibleActions(s)[0]
                max_sum = float("-inf")
                for a in mdp.getPossibleActions(s):
                    sum_for_a = 0
                    # the transition model is a list of (nextState, prob) pairs
                    for nextState, prob in mdp.getTransitionStatesAndProbs(s, a):
                        sum_for_a += prob * U[nextState]
                    if max_sum < sum_for_a:
                        max_sum = sum_for_a
                        max_a = a
                # Reward taken from the most likely successor under the best action.
                T = mdp.getTransitionStatesAndProbs(s, max_a)
                self.values[s] = mdp.getReward(s, max_a, T[0][0]) + self.discount * max_sum
            delta = max(delta, abs(self.values[s] - U[s]))
        if delta <= max_err * (1 - self.discount) / self.discount:
            for key in U:
                self.values[key] = U[key]
            break
        # last line inside the while loop
        iteration_number += 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    self.qValues = util.Counter()  # to hold Q-values

    # FIXME Eliminate magic strings ('exit', "TERMINAL_STATE")
    for state in mdp.getStates():
        if 'exit' in mdp.getPossibleActions(state):
            self.values[state] = mdp.getReward(
                state, 'exit',
                mdp.getTransitionStatesAndProbs(state, 'exit')[0][0])
    for iter in range(1, iterations):
        oldValues, newValues = self.values, util.Counter()
        oldQValues, newQValues = self.qValues, util.Counter()
        for state in [state1 for state1 in self.mdp.getStates()
                      if str(state1) != "TERMINAL_STATE"]:
            valueActionPairs = [(sum([stateAndProb[1]
                                      * (self.mdp.getReward(state, action, stateAndProb[0])
                                         + self.discount * oldValues[stateAndProb[0]])
                                      for stateAndProb in
                                      self.mdp.getTransitionStatesAndProbs(state, action)]),
                                 action)
                                for action in mdp.getPossibleActions(state)]
            for action in mdp.getPossibleActions(state):
                newQValues[(state, action)] = [
                    valueActionPair[0] for valueActionPair in valueActionPairs
                    if valueActionPair[1] == action][0]  # FIXME Assumes there is only one
            newValues[state] = max(valueActionPairs, key=lambda x: x[0])[0]
        self.values, self.qValues = newValues, newQValues
display = graphicsGridworldDisplay.GraphicsGridworldDisplay(mdp, opts.gridSize, opts.speed)
display.start()

###########################
# GET THE AGENT
###########################

import valueIterationAgents, qlearningAgents
a = None
if opts.agent == 'value':
    a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters)
elif opts.agent == 'q':
    #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
    #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
    gridWorldEnv = GridworldEnvironment(mdp)
    actionFn = lambda state: mdp.getPossibleActions(state)
    qLearnOpts = {'gamma': opts.discount,
                  'alpha': opts.learningRate,
                  'epsilon': opts.epsilon,
                  'actionFn': actionFn}
    a = qlearningAgents.QLearningAgent(**qLearnOpts)
elif opts.agent == 'random':
    # No reason to use the random agent without episodes
    if opts.episodes == 0:
        opts.episodes = 10
    class RandomAgent:
        def getAction(self, state):
            return random.choice(mdp.getPossibleActions(state))
        def getValue(self, state):
            return 0.0
        def getQValue(self, state, action):
            return 0.0
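# For context: a driver like the one above is normally invoked from the command
# line of the Berkeley gridworld project, e.g. (exact flags may vary by course
# release; these are from the published project spec):
#
#   python gridworld.py -a value -i 100 -k 10
#   python gridworld.py -a q -k 5 -m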
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # A lot like doing the expectimax recurrence for some horizon:
    # start with V0 = 0, since no time left means no reward;
    # given Vk, do one ply of expectimax to get Vk+1; repeat until converged.
    states = mdp.getStates()
    # for the depth of the horizon
    for k in range(self.iterations):
        # Every iteration updates the values and (implicitly) the policy.
        # self.values is the accumulator: at each step it holds the V_{k-1} info.
        # Use the "batch" version of value iteration, where each vector Vk is
        # computed from a fixed vector V_{k-1} (as in lecture), not updated in place.
        vk = util.Counter()
        # for each state, Vk is the max over its actions of the expected backup
        for s in states:
            act = mdp.getPossibleActions(s)
            maxk = -99E99
            for a in act:
                # Expected value at this step based on the transition model:
                # fold prob * (reward(s, a, s') + gamma * V_{k-1}(s')) over the
                # (state, prob) pairs, starting the accumulator at 0.
                staProb = mdp.getTransitionStatesAndProbs(s, a)
                sum2 = functools.reduce(
                    lambda x, y: x + y[1] * (mdp.getReward(s, a, y[0])
                                             + self.discount * self.values[y[0]]),
                    [0] + staProb)
                maxk = max(maxk, sum2)
            # Handle the case where a state has no available actions in the MDP:
            # it contributes no future reward and stays 0 in the counter.
            if len(act) != 0:
                vk[s] = maxk
        # after this iterate is done, vk becomes the new V_{k-1}
        self.values = vk.copy()
    # so, at the end of the iterations, vk is now V_{k-1}
    return
def actionFn(state): return mdp.getPossibleActions(state)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    # for storing the policy:
    self.policy = util.Counter()
    # for storing the Q values:
    self.Q_values = util.Counter()
    all_states = mdp.getStates()

    for i in range(iterations):
        new_values = util.Counter()
        for state in all_states:
            if mdp.isTerminal(state):
                continue
            actions = mdp.getPossibleActions(state)
            best = -2000000000
            for action in actions:
                total = 0
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                for next_state, prob in transitions:
                    total += prob * (mdp.getReward(state, action, next_state)
                                     + discount * self.values[next_state])
                if total > best:
                    best = total
            new_values[state] = best
        self.values = new_values

    # Calculate the best policy
    for state in all_states:
        if mdp.isTerminal(state):
            continue
        best_action = None
        best = -2000000000
        actions = mdp.getPossibleActions(state)
        for action in actions:
            new_states = mdp.getTransitionStatesAndProbs(state, action)
            total = 0
            for next_state, prob in new_states:
                total += prob * self.values[next_state]
            if total > best:
                best = total
                best_action = action
        self.policy[state] = best_action

    # Calculate all Q values (Q-value iteration)
    for i in range(iterations):
        new_q_values = util.Counter()
        for state in all_states:
            if mdp.isTerminal(state):
                continue
            actions = mdp.getPossibleActions(state)
            for action in actions:
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                total = 0
                for next_state, prob in transitions:
                    new_actions = mdp.getPossibleActions(next_state)
                    max_action = -2000000000
                    if len(new_actions) == 0:
                        max_action = 0
                    for a in new_actions:
                        v = self.Q_values[(next_state, a)]
                        if v > max_action:
                            max_action = v
                    total += prob * (mdp.getReward(state, action, next_state)
                                     + discount * max_action)
                new_q_values[(state, action)] = total
        self.Q_values = new_q_values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.policy = util.Counter()
    self.q_value = util.Counter()

    for k in range(iterations):
        previous_values = self.values.copy()
        previous_q = self.q_value.copy()
        for state in mdp.getStates():
            possible_actions = mdp.getPossibleActions(state)
            if not possible_actions:
                continue  # terminal states have no actions and keep value 0
            max_val = -float("inf")
            for action in possible_actions:
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                total = 0
                q_val = 0
                for next_state, prob in transitions:
                    reward = mdp.getReward(state, action, next_state)
                    previous = discount * previous_values[next_state]
                    total += (reward + previous) * prob
                    list_q = []
                    possible_actions2 = self.mdp.getPossibleActions(next_state)
                    for q_act in possible_actions2:
                        list_q.append(previous_q[(next_state, q_act)])
                    if len(list_q) == 0:
                        previous_q_val = 0
                    else:
                        previous_q_val = discount * max(list_q)
                    q_val = (reward + previous_q_val) * prob
                if max_val < total:
                    max_val = total
                self.q_value[(state, action)] = q_val
            self.values[state] = max_val

    for state in mdp.getStates():
        action_values = util.Counter()
        possible_actions = mdp.getPossibleActions(state)
        for action in possible_actions:
            transitions = mdp.getTransitionStatesAndProbs(state, action)
            total = 0
            for next_state, prob in transitions:
                reward = mdp.getReward(state, action, next_state)
                previous = discount * self.values[next_state]
                total += (reward + previous) * prob
            action_values[action] = total
        self.policy[state] = action_values.argMax()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Loop according to the supplied number of iterations.
    for iteration in range(self.iterations):
        # Copy the values so each sweep reads the previous iteration's vector.
        val = self.values.copy()
        # Get all the states in the mdp.
        states = mdp.getStates()
        for state in states:
            # Get all the possible actions for the current state.
            actions = mdp.getPossibleActions(state)
            if not mdp.isTerminal(state):
                best = -99999
                for action in actions:
                    v = 0
                    # Get all the possible transitions for the state and action.
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    for transition in transitions:
                        # Perform the value-iteration backup.
                        v = v + transition[1] * (mdp.getReward(state, action, transition[0])
                                                 + discount * self.values[transition[0]])
                    if v > best:
                        best = v
                val[state] = best
            else:
                for action in actions:
                    v = 0
                    # Get all the possible transitions for the state and action.
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    for transition in transitions:
                        # Perform the value-iteration backup.
                        v = v + transition[1] * (mdp.getReward(state, action, transition[0])
                                                 + discount * self.values[transition[0]])
                    val[state] = v
        self.values = val
def getAction(self, state): return random.choice(mdp.getPossibleActions(state))
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        valuesHolder = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                # Terminal states only carry their exit reward.
                valuesHolder[state] = mdp.getReward(state, 'exit', '')
            else:
                valuesHolder[state] = max([self.computeQValueFromValues(state, action)
                                           for action in mdp.getPossibleActions(state)])
        self.values = valuesHolder
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    print("using discount {}".format(discount))
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    delta = 0.01
    self.policy = dict()
    total_iterations = 0
    for _ in range(self.iterations):
        for iteration in range(self.iterations):
            # policy evaluation
            temp_values = util.Counter()
            l2_distance = 0
            for state in self.mdp.getStates():
                if state not in self.policy:
                    # initialize a random policy
                    if not self.mdp.getPossibleActions(state):
                        self.policy[state] = None
                    else:
                        self.policy[state] = np.random.choice(
                            mdp.getPossibleActions(state))
                if mdp.isTerminal(state):
                    temp_values[state] = 0
                    continue
                transitions = mdp.getTransitionStatesAndProbs(state, self.policy[state])
                value = 0
                for pair in transitions:
                    value += pair[1] * (mdp.getReward(state, self.policy[state], pair[0])
                                        + self.discount * self.values[pair[0]])
                temp_values[state] = value
                l2_distance = max(l2_distance,
                                  np.linalg.norm(value - self.values[state]))
            total_iterations += 1
            self.values = temp_values
            if l2_distance < delta:
                break
        policy_converged = True
        for state in self.mdp.getStates():
            # policy improvement
            if mdp.isTerminal(state):
                continue
            current_value = self.computeQValueFromValues(state, self.policy[state])
            current_action = self.policy[state]
            for action in self.mdp.getPossibleActions(state):
                if self.computeQValueFromValues(state, action) > current_value:
                    current_value = self.computeQValueFromValues(state, action)
                    current_action = action
                    policy_converged = False
            self.policy[state] = current_action
        if policy_converged:
            print(total_iterations)
            break
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for s in mdp.getStates():
        self.values[s] = 0
    i = 0
    while i < iterations:
        copy_value = self.values.copy()
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                # In this gridworld the reward depends only on the state being
                # exited, so any action (here 'north') yields the same reward.
                self.values[s] = mdp.getReward(s, 'north', s) + discount * max(
                    [sum([copy_value[s1] * p
                          for (s1, p) in mdp.getTransitionStatesAndProbs(s, a)])
                     for a in mdp.getPossibleActions(s)])
        i = i + 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on
    construction, run the indicated number of iterations
    and then act according to the resulting policy.
    """
    '''
    Some useful mdp methods you will use:
        states, actions, transitions & probabilities, rewards,
        discount, and the horizon via isTerminal.
    Goal: find the optimal policy.

    print(mdp.getStates())
    # ['TERMINAL_STATE', (0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0), (3, 1), (3, 2)]
    hello = mdp.getStates()[10]
    print(mdp.getPossibleActions(hello))
    # ('north', 'west', 'south', 'east')
    mdp.getTransitionStatesAndProbs(hello, 'south')
    # [((1, 2), 0.8), ((0, 2), 0.1), ((2, 2), 0.1)]
    print(mdp.getReward(hello, 'south', (3, 0)))  # gives you the reward
    print(mdp.isTerminal(hello))
    '''
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    x = 0
    while x < iterations:
        newValues = self.values.copy()
        states = mdp.getStates()
        for i in states:
            # terminal states are already initialized to zero
            if not mdp.isTerminal(i):
                maxx_qval = float("-inf")
                directions = mdp.getPossibleActions(i)
                for j in directions:
                    summ = 0
                    trans = mdp.getTransitionStatesAndProbs(i, j)  # transition model
                    for k in trans:
                        # k[0] is the next state, k[1] its probability
                        myreward = mdp.getReward(i, j, k[0])  # reward
                        summ = summ + (((discount * self.values[k[0]]) + myreward) * k[1])
                    qvalue = summ
                    if qvalue > maxx_qval:
                        maxx_qval = qvalue
                newValues[i] = maxx_qval
        self.values = newValues
        x = x + 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    # Need to find new utilities for all states in S.
    states = mdp.getStates()
    # Update the values once per iteration.
    for i in range(iterations):
        # Keep the new values separate so each sweep reads the old ones.
        newValues = util.Counter()
        for s in states:
            if mdp.isTerminal(s):
                newValues[s] = self.mdp.getReward(s, None, None)
            else:
                actions = mdp.getPossibleActions(s)
                # As long as there are actions to take, update the utility
                # to the maximum Q-value over actions.
                if len(actions) != 0:
                    qValues = []
                    for a in actions:
                        sumOfTransitions = 0.0
                        for nextState, prob in self.mdp.getTransitionStatesAndProbs(s, a):
                            # Value of the successor state, weighted by the
                            # probability of getting there.
                            sumOfTransitions += prob * self.getValue(nextState)
                        # This snippet uses the R(s) reward convention:
                        # current reward + discount * expected next value.
                        # An R(s, a, s') variant would instead fold
                        # getReward(s, a, nextState) into the transition sum.
                        qValue = self.mdp.getReward(s, None, None) + self.discount * sumOfTransitions
                        qValues.append(qValue)
                    newValues[s] = max(qValues)
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.policies = util.Counter()
    delta = 0.01
    iteration = 1

    # Initialize the policy arbitrarily: first legal action in each state.
    for state in mdp.getStates():
        actions = mdp.getPossibleActions(state)
        if len(actions) >= 1:
            self.policies[state] = actions[0]
        else:
            self.policies[state] = None

    policy_loop = 0
    while True:
        policy_loop += 1
        # Policy evaluation: back up values under the current policy until
        # they stop changing, or the iteration budget runs out.
        while True:
            iteration += 1
            difference = 0
            for state in mdp.getStates():
                old_value = self.values[state]
                action = self.policies[state]
                if action is None:
                    continue
                self.values[state] = self.computeQValueFromValues(state, action)
                difference = max(difference, abs(old_value - self.values[state]))
            if difference < delta or iteration == iterations:
                break
        if iteration == iterations:
            break
        # Policy improvement: make the policy greedy with respect to the values.
        stable = True
        iteration += 1
        for state in mdp.getStates():
            old_policy = self.policies[state]
            self.policies[state] = self.computeActionFromValues(state)
            if old_policy != self.policies[state]:
                stable = False
        if stable or iteration == iterations:
            break
    print("It took a total of {} iterations to converge.".format(iteration))
    print("It took a total of {} total policy iteration loops to converge.".format(policy_loop))
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    for i in range(iterations):
        storeValues = util.Counter()
        for s in mdp.getStates():
            actions = mdp.getPossibleActions(s)
            if len(actions) == 0:
                continue
            qVals = [self.getQValue(s, a) for a in actions]
            storeValues[s] = max(qVals)
        self.values = storeValues

def getValue(self, state):
    """
      Return the value of the state (computed in __init__).
    """
    return self.values[state]

def computeQValueFromValues(self, state, action):
    """
      Compute the Q-value of action in state from the
      value function stored in self.values.
    """
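    # The body is truncated in the source; what follows is a minimal sketch
    # of the standard one-step lookahead this method is expected to perform:
    #   Q(s, a) = sum_{s'} T(s, a, s') * [R(s, a, s') + discount * V(s')]
    qValue = 0
    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
        qValue += prob * (self.mdp.getReward(state, action, nextState)
                          + self.discount * self.values[nextState])
    return qValue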
import graphicsGridworldDisplay
display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
    mdp, opts.gridSize, opts.speed)
display.start()

###########################
# GET THE AGENT
###########################
import valueIterationAgents, qlearningAgents, sarsaLambdaAgents
a = None
if opts.agent == 'value':
    a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters)
elif opts.agent == 'valueApproximate':
    actionFn = lambda state: mdp.getPossibleActions(state)
    qLearnOpts = {'gamma': opts.discount,
                  'iterations': opts.iters,
                  'mdp': mdp,
                  'alpha': opts.learningRate,
                  'epsilon': opts.epsilon,
                  'extractor': opts.extractor,
                  'actionFn': actionFn}
    a = valueIterationAgents.ApproximateValueIterAgent(**qLearnOpts)
elif opts.agent == 'q':
    #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
    #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
    actionFn = lambda state: mdp.getPossibleActions(state)
    qLearnOpts = {
def main(myargs):
    sys.argv = myargs.split()
    opts = parseOptions()

    ###########################
    # GET THE GRIDWORLD
    ###########################
    if opts.grid == 'VerticalBridgeGrid':
        opts.gridSize = 120
    import gridworld
    mdpFunction = getattr(gridworld, "get" + opts.grid)
    mdp = mdpFunction()
    mdp.setLivingReward(opts.livingReward)
    mdp.setNoise(opts.noise)
    env = gridworld.GridworldEnvironment(mdp)

    ###########################
    # GET THE DISPLAY ADAPTER
    ###########################
    import textGridworldDisplay
    display = textGridworldDisplay.TextGridworldDisplay(mdp)
    if not opts.textDisplay:
        import graphicsGridworldDisplay
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
            mdp, opts.gridSize, opts.speed)
    try:
        display.start()
    except KeyboardInterrupt:
        sys.exit(0)

    ###########################
    # GET THE AGENT
    ###########################
    import valueIterationAgents, qlearningAgents
    a = None
    if opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters)
    elif opts.agent == 'q':
        #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
        #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
        gridWorldEnv = gridworld.GridworldEnvironment(mdp)
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {'gamma': opts.discount,
                      'alpha': opts.learningRate,
                      'epsilon': opts.epsilon,
                      'actionFn': actionFn}
        a = qlearningAgents.QLearningAgent(**qLearnOpts)
    elif opts.agent == 'random':
        # No reason to use the random agent without episodes.
        if opts.episodes == 0:
            opts.episodes = 10

        class RandomAgent:
            def getAction(self, state):
                return random.choice(mdp.getPossibleActions(state))
            def getValue(self, state):
                return 0.0
            def getQValue(self, state, action):
                return 0.0
            def getPolicy(self, state):
                "NOTE: 'random' is a special policy value; don't use it in your code."
                return 'random'
            def update(self, state, action, nextState, reward):
                pass

        a = RandomAgent()
    else:
        if not opts.manual:
            raise Exception('Unknown agent type: ' + opts.agent)

    ###########################
    # RUN EPISODES
    ###########################
    # DISPLAY Q/V VALUES BEFORE SIMULATION OF EPISODES
    try:
        if not opts.manual and opts.agent == 'value':
            if opts.valueSteps:
                for i in range(opts.iters):
                    tempAgent = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, i)
                    display.displayValues(tempAgent, message="VALUES AFTER " + str(i) + " ITERATIONS")
                    display.pause()
            display.displayValues(a, message="VALUES AFTER " + str(opts.iters) + " ITERATIONS")
            display.pause()
            display.displayQValues(a, message="Q-VALUES AFTER " + str(opts.iters) + " ITERATIONS")
            display.pause()
    except KeyboardInterrupt:
        sys.exit(0)

    # FIGURE OUT WHAT TO DISPLAY EACH TIME STEP (IF ANYTHING)
    displayCallback = lambda x: None
    if not opts.quiet:
        if opts.manual and opts.agent == None:
            displayCallback = lambda state: display.displayNullValues(state)
        else:
            if opts.agent == 'random':
                displayCallback = lambda state: display.displayValues(a, state, "CURRENT VALUES")
            if opts.agent == 'value':
                displayCallback = lambda state: display.displayValues(a, state, "CURRENT VALUES")
            if opts.agent == 'q':
                displayCallback = lambda state: display.displayQValues(a, state, "CURRENT Q-VALUES")

    messageCallback = lambda x: printString(x)
    if opts.quiet:
        messageCallback = lambda x: None

    # FIGURE OUT WHETHER TO WAIT FOR A KEY PRESS AFTER EACH TIME STEP
    pauseCallback = lambda: None
    if opts.pause:
        pauseCallback = lambda: display.pause()

    # FIGURE OUT WHETHER THE USER WANTS MANUAL CONTROL (FOR DEBUGGING AND DEMOS)
    if opts.manual:
        decisionCallback = lambda state: getUserAction(state, mdp.getPossibleActions)
    else:
        decisionCallback = a.getAction

    # RUN EPISODES
    if opts.episodes > 0:
        print()
        print("RUNNING", opts.episodes, "EPISODES")
        print()
    returns = 0
    for episode in range(1, opts.episodes + 1):
        returns += runEpisode(a, env, opts.discount, decisionCallback,
                              displayCallback, messageCallback, pauseCallback, episode)
    if opts.episodes > 0:
        print()
        print("AVERAGE RETURNS FROM START STATE: " + str((returns + 0.0) / opts.episodes))
        print()
        print()

    # DISPLAY POST-LEARNING VALUES / Q-VALUES
    if opts.agent == 'q' and not opts.manual:
        try:
            display.displayQValues(a, message="Q-VALUES AFTER " + str(opts.episodes) + " EPISODES")
            display.pause()
            display.displayValues(a, message="VALUES AFTER " + str(opts.episodes) + " EPISODES")
            display.pause()
        except KeyboardInterrupt:
            sys.exit(0)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"

    # An earlier recursive formulation of the backup, kept for reference;
    # the iterative loop below is what actually runs.
    def computeValue(mdp, state, iterationCount):
        if mdp.isTerminal(state):
            return self.values[state]
        iterationCount -= 1
        if iterationCount < -1:
            return self.values[state]
        maxExpect = float("-inf")
        for action in mdp.getPossibleActions(state):
            expect = 0.0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                expect += prob * (mdp.getReward(state, action, nextState)
                                  + discount * computeValue(mdp, nextState, iterationCount))
            maxExpect = max(maxExpect, expect)
        self.values[state] = maxExpect
        return maxExpect

    # Batch value iteration: write one full sweep into a temporary Counter,
    # then commit it.
    allStates = mdp.getStates()
    for i in range(self.iterations):
        temp = util.Counter()
        for state in allStates:
            if self.mdp.isTerminal(state):
                continue
            maxExpect = float("-inf")
            for action in mdp.getPossibleActions(state):
                expect = self.computeQValueFromValues(state, action)
                maxExpect = max(maxExpect, expect)
            temp[state] = maxExpect
        self.values = temp
def runValueIteration(self):
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    values = self.values
    discount = self.discount
    iterations = self.iterations
    theta = self.theta
    states = mdp.getStates()

    # Map each state to the set of states that can transition into it.
    predecessors = {}
    for state in states:
        predecessors[state] = set()

    pq = util.PriorityQueue()
    # Compute predecessors and seed the queue with each state's Bellman error.
    for state in states:
        Q_s = util.Counter()
        for action in mdp.getPossibleActions(state):
            T = mdp.getTransitionStatesAndProbs(state, action)
            for (nextState, prob) in T:
                if prob != 0:
                    predecessors[nextState].add(state)
            # Q-values determine the diffs used as priorities.
            Q_s[action] = self.computeQValueFromValues(state, action)
        if not mdp.isTerminal(state):
            maxQ_s = Q_s[Q_s.argMax()]
            diff = abs(values[state] - maxQ_s)
            pq.update(state, -diff)

    # The actual prioritized sweeps.
    for i in range(iterations):
        if pq.isEmpty():
            return
        state = pq.pop()
        if not mdp.isTerminal(state):
            Q_s = util.Counter()
            for action in mdp.getPossibleActions(state):
                Q_s[action] = self.computeQValueFromValues(state, action)
            values[state] = Q_s[Q_s.argMax()]
        # Re-prioritize every predecessor whose Bellman error now exceeds theta.
        for p in predecessors[state]:
            Q_p = util.Counter()
            for action in mdp.getPossibleActions(p):
                Q_p[action] = self.computeQValueFromValues(p, action)
            maxQ_p = Q_p[Q_p.argMax()]
            diff = abs(values[p] - maxQ_p)
            if diff > theta:
                pq.update(p, -diff)
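# A small standalone illustration (assuming the course's util.py is on the
# path and that util.PriorityQueue pops the item with the *lowest* priority)
# of why the sweeps above push -diff: the state with the largest Bellman
# error comes out of the queue first.
import util

pq = util.PriorityQueue()
pq.update('s1', -0.5)   # Bellman error 0.5
pq.update('s2', -2.0)   # Bellman error 2.0
print(pq.pop())         # prints 's2': the largest-error state is processed first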
def update_state(self, mdp, state, vk):
    # One Bellman backup: write max_a Q(state, a) into the value table vk.
    Q = util.Counter()
    for action in mdp.getPossibleActions(state):
        Q[action] = self.computeQValueFromValues(state, action)
    vk[state] = Q[Q.argMax()]
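# Hypothetical usage sketch (not from this file): a Gauss-Seidel-style agent
# could drive update_state by passing self.values as vk, so backups made early
# in a sweep are immediately visible to later backups in the same sweep. The
# driver method below is an assumption, named to mirror the other agents.
def runValueIteration(self):
    for _ in range(self.iterations):
        for state in self.mdp.getStates():
            if not self.mdp.isTerminal(state):
                # In-place update: vk is the live value table.
                self.update_state(self.mdp, state, self.values)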
import graphicsGridworldDisplay
display = graphicsGridworldDisplay.GraphicsGridworldDisplay(mdp, opts.gridSize, opts.speed)
try:
    display.start()
except KeyboardInterrupt:
    sys.exit(0)

###########################
# GET THE AGENT
###########################
# Note: the 'value', 'valuegs', and 'rtdp' branches below still reference
# valueIterationAgents and rtdp even though their import is commented out.
#import valueIterationAgents, rtdp #, qlearningAgents
import sarsa_agents, tree_backup, q_sigma
a = None
if opts.agent == 'n_step_sarsa':
    a = sarsa_agents.NStepSarsaAgent(discount=opts.discount,
                                     alpha=opts.learningRate,
                                     epsilon=opts.epsilon,
                                     actionFn=lambda state: mdp.getPossibleActions(state),
                                     n=opts.stepn,
                                     terminalFn=lambda state: mdp.isTerminal(state))
elif opts.agent == 'n_step_expected_sarsa':
    a = sarsa_agents.NStepExpectedSarsaAgent(discount=opts.discount,
                                             alpha=opts.learningRate,
                                             epsilon=opts.epsilon,
                                             actionFn=lambda state: mdp.getPossibleActions(state),
                                             n=opts.stepn,
                                             terminalFn=lambda state: mdp.isTerminal(state))
elif opts.agent == 'tree_backup':
    a = tree_backup.NStepTreeBackupAgent(discount=opts.discount,
                                         alpha=opts.learningRate,
                                         epsilon=opts.epsilon,
                                         actionFn=lambda state: mdp.getPossibleActions(state),
                                         n=opts.stepn,
                                         terminalFn=lambda state: mdp.isTerminal(state))
elif opts.agent == 'qsigma':
    a = q_sigma.QSigmaAgent(discount=opts.discount,
                            alpha=opts.learningRate,
                            epsilon=opts.epsilon,
                            actionFn=lambda state: mdp.getPossibleActions(state),
                            n=opts.stepn,
                            terminalFn=lambda state: mdp.isTerminal(state),
                            sigma=opts.sigma,
                            numEpisodes=opts.episodes)
elif opts.agent == 'value':
    a = valueIterationAgents.ValueIterationAgent(mdp, env, opts.discount, opts.iters, display)
elif opts.agent == 'valuegs':
    a = valueIterationAgents.GSValueIterationAgent(mdp, env, opts.discount, opts.iters, display)
elif opts.agent == 'rtdp':
    a = rtdp.RTDPLearningAgent(mdp, env, opts.discount, opts.iters)
elif opts.agent == 'q':
    #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
    #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)