def main():
    """First-visit Monte Carlo policy evaluation ("MP-Passive").

    Reads the grid world from input.txt, computes the DP (value-iteration)
    utility as a reference, then estimates the utility of the DP-optimal
    policy by averaging first-visit returns over randomly-started episodes
    with 30% action noise, and prints both utility matrices.
    """
    size, obstacles, car_start, car_end = ReadInput("input.txt")
    # FIX: the original computed the transition matrix twice back-to-back;
    # one call suffices.
    transitionMatrix = tm.GenerateTransitionMatrix(size)
    rewardMatrix = np.full((size, size), -1.0)
    for a in obstacles:
        rewardMatrix[a] = -101.0  # obstacle penalty on top of the -1 step cost
    utilityMatrix, optimalPolicy = vi.GetOptimalPolicy(
        size, transitionMatrix, rewardMatrix.copy(), car_end)
    rewardMatrix[car_end] += 100  # goal reward (net +99 after the step cost)
    utility = np.zeros((size, size))
    # Tiny initial count avoids division by zero for never-visited states.
    countMatrix = np.full((size, size), 1.0e-10)
    gamma = 0.9
    for j in range(10000):
        episode = list()
        pos = (np.random.randint(0, size), np.random.randint(0, size))
        # pos = car_start
        dest = car_end
        np.random.seed(j)  # reproducible per-episode noise stream
        swerve = np.random.random_sample(10000000)
        k = 0
        while pos != dest:
            move = optimalPolicy[pos]
            # 30% chance the action is perturbed: 10% left, 10% right,
            # 10% reversed (two left turns).
            if swerve[k] > 0.7:
                if swerve[k] > 0.8:
                    if swerve[k] > 0.9:
                        move = Turn_Left(Turn_Left(move))
                    else:
                        move = Turn_Right(move)
                else:
                    move = Turn_Left(move)
            k += 1
            pos = UpdatePos(size, pos, move)
            episode.append((pos, rewardMatrix[pos]))
        # First-visit update: accumulate the return from each state's first
        # occurrence in the episode (enumerate replaces the manual counter).
        visited = dict()
        for i, (state, _) in enumerate(episode):
            if state not in visited:
                utility[state] += GetReturns(episode[i:], gamma)
                countMatrix[state] += 1
                visited[state] = 1
    utility /= countMatrix  # sample mean of accumulated returns
    print("DP Utility:")
    print(utilityMatrix)
    print("MP-Passive")
    print(utility)
def main():
    """TD(lambda) online policy evaluation with eligibility traces.

    Reads the grid world from input.txt, computes the DP (value-iteration)
    utility as a reference, then evaluates the DP-optimal policy online:
    each noisy step produces a TD error that is propagated through an
    accumulating eligibility trace. Prints both utility matrices.
    """
    size, obstacles, car_start, car_end = ReadInput("input.txt")
    # FIX: the original computed the transition matrix twice back-to-back;
    # one call suffices.
    transitionMatrix = tm.GenerateTransitionMatrix(size)
    rewardMatrix = np.full((size, size), -1.0)
    for a in obstacles:
        rewardMatrix[a] = -101.0  # obstacle penalty on top of the -1 step cost
    utilityMatrix, optimalPolicy = vi.GetOptimalPolicy(
        size, transitionMatrix, rewardMatrix.copy(), car_end)
    utility = np.zeros((size, size))
    gamma = 0.9    # discount factor
    alpha = 0.1    # learning rate
    lambda_ = 0.5  # trace-decay parameter (renamed from `Lambda`, PEP 8)
    rewardMatrix[car_end] += 100  # goal reward (net +99 after the step cost)
    # The terminal state's value is fixed to its reward; it is never updated
    # because the episode loop stops on arrival.
    utility[car_end] = rewardMatrix[car_end]
    # Generate episodes and update online.
    for j in range(4000):
        pos = (np.random.randint(0, size), np.random.randint(0, size))
        # pos = car_start
        dest = car_end
        np.random.seed(j)  # reproducible per-episode noise stream
        swerve = np.random.random_sample(10000000)
        k = 0
        traceMatrix = np.zeros((size, size))  # eligibility traces, reset per episode
        while pos != dest:
            move = optimalPolicy[pos]
            # 30% chance the action is perturbed: 10% left, 10% right,
            # 10% reversed (two left turns).
            if swerve[k] > 0.7:
                if swerve[k] > 0.8:
                    if swerve[k] > 0.9:
                        move = Turn_Left(Turn_Left(move))
                    else:
                        move = Turn_Right(move)
                else:
                    move = Turn_Left(move)
            k += 1
            newpos = UpdatePos(size, pos, move)
            # TD error for this transition.
            delta = rewardMatrix[pos] + gamma * utility[newpos] - utility[pos]
            traceMatrix[pos] += 1  # accumulating trace
            utility = updateUtility(utility, traceMatrix, alpha, delta)
            traceMatrix = updateTraceMatrix(traceMatrix, gamma, lambda_)
            pos = newpos
    print("Dynamic Programming Utility:")
    print(utilityMatrix)
    print("TDlambda Utility:")
    print(utility)
def main():
    """Monte Carlo control with exploring starts (first-visit, Q-learning style).

    Reads the grid world from input.txt, computes the DP reference solution,
    then improves a random initial policy by estimating state-action values
    from first-visit returns of randomly-started episodes (with a random
    first action and 30% action noise), greedifying after each episode.
    Prints the DP utility, the learned Q averages, and both policies.
    """
    size, obstacles, car_start, car_end = ReadInput("input.txt")
    # FIX: the original computed the transition matrix twice back-to-back;
    # one call suffices.
    transitionMatrix = tm.GenerateTransitionMatrix(size)
    rewardMatrix = np.full((size, size), -1.0)
    for a in obstacles:
        rewardMatrix[a] = -101.0  # obstacle penalty on top of the -1 step cost
    utilityMatrix, optimalPolicy = vi.GetOptimalPolicy(
        size, transitionMatrix, rewardMatrix.copy(), car_end)
    policy = np.random.randint(low=0, high=4, size=(size, size)).astype(np.int32)
    policy[car_end] = -1  # terminal marker: no action at the goal
    rewardMatrix[car_end] += 100  # goal reward (net +99 after the step cost)
    stateActionMatrix = np.zeros((4, size, size))  # accumulated returns per (a, x, y)
    gamma = 0.9
    # Tiny initial count avoids division by zero for unvisited pairs.
    countMatrix = np.full((4, size, size), 1.0e-10)
    for j in range(1000):
        episode = list()
        pos = (np.random.randint(0, size), np.random.randint(0, size))
        # pos = car_start
        dest = car_end
        np.random.seed(j)  # reproducible per-episode noise stream
        swerve = np.random.random_sample(10000000)
        k = 0
        starting = True
        while pos != dest:
            move = policy[pos]
            action = policy[pos]
            if starting:
                # Exploring start: random first action.
                action = np.random.randint(0, 4)
                # BUG FIX: the original recorded the random action but kept
                # executing policy[pos]; the exploring-start action must be
                # the one actually taken or its return is misattributed.
                move = action
                starting = False
            # 30% chance the executed action is perturbed: 10% left,
            # 10% right, 10% reversed (two left turns).
            if swerve[k] > 0.7:
                if swerve[k] > 0.8:
                    if swerve[k] > 0.9:
                        move = Turn_Left(Turn_Left(move))
                    else:
                        move = Turn_Right(move)
                else:
                    move = Turn_Left(move)
            k += 1
            newpos = UpdatePos(size, pos, move)
            episode.append((pos, action, rewardMatrix[pos]))
            pos = newpos
        # Terminal entry (dummy action 0) so the goal reward enters returns.
        episode.append((pos, 0, rewardMatrix[pos]))
        # First-visit update over (action, state) pairs.
        visited = dict()
        for i, (state, action, _) in enumerate(episode):
            if (action, state[0], state[1]) not in visited:
                # BUG FIX: the original never incremented i, so every
                # return was computed from the start of the episode;
                # first-visit MC needs the tail from index i.
                returns = GetReturns(episode[i:], gamma)
                stateActionMatrix[action, state[0], state[1]] += returns
                countMatrix[action, state[0], state[1]] += 1
                visited[action, state[0], state[1]] = 1
        # Greedy policy improvement from the current Q estimates.
        policy = UpdatePolicy(episode, policy, stateActionMatrix / countMatrix)
    print("DP Utility:")
    print(utilityMatrix)
    print("MP-Passive")
    print(stateActionMatrix / countMatrix)
    print("optimal policy")
    print(optimalPolicy)
    print("learned policy")
    print(policy)
def main():
    """SARSA on-policy temporal-difference control.

    Reads the grid world from input.txt, computes the DP reference solution,
    then improves a random initial policy online: each noisy transition
    updates Q(s, a) toward r + gamma * Q(s', a') and greedifies the policy
    at the visited state. Prints the DP utility, the learned Q matrix, and
    both policies.
    """
    size, obstacles, car_start, car_end = ReadInput("input.txt")
    # FIX: the original computed the transition matrix twice back-to-back;
    # one call suffices.
    transitionMatrix = tm.GenerateTransitionMatrix(size)
    rewardMatrix = np.full((size, size), -1.0)
    for a in obstacles:
        rewardMatrix[a] = -101.0  # obstacle penalty on top of the -1 step cost
    utilityMatrix, optimalPolicy = vi.GetOptimalPolicy(
        size, transitionMatrix, rewardMatrix.copy(), car_end)
    policy = np.random.randint(low=0, high=4, size=(size, size)).astype(np.int32)
    policy[car_end] = 0  # action at the goal is never executed
    rewardMatrix[car_end] += 100  # goal reward (net +99 after the step cost)
    stateActionMatrix = np.zeros((4, size, size))
    # Fix the terminal Q for every action to the goal reward; one slice
    # assignment replaces the original four-way chained assignment.
    stateActionMatrix[:, car_end[0], car_end[1]] = rewardMatrix[car_end]
    gamma = 0.9  # discount factor
    alpha = 0.1  # learning rate
    # Generate episodes and update online.
    for j in range(4000):
        pos = (np.random.randint(0, size), np.random.randint(0, size))
        # pos = car_start
        dest = car_end
        np.random.seed(j)  # reproducible per-episode noise stream
        swerve = np.random.random_sample(10000000)
        k = 0
        while pos != dest:
            # `action` is the intended (on-policy) action used in the update;
            # `move` is what actually gets executed after environment noise.
            action = policy[pos]
            move = policy[pos]
            # 30% chance the executed action is perturbed: 10% left,
            # 10% right, 10% reversed (two left turns).
            if swerve[k] > 0.7:
                if swerve[k] > 0.8:
                    if swerve[k] > 0.9:
                        move = Turn_Left(Turn_Left(move))
                    else:
                        move = Turn_Right(move)
                else:
                    move = Turn_Left(move)
            k += 1
            newpos = UpdatePos(size, pos, move)
            nextaction = policy[newpos]
            # SARSA backup: Q(s,a) <- Q(s,a) + alpha*(r + gamma*Q(s',a') - Q(s,a)).
            stateActionMatrix = updateStateActionMatrix(
                stateActionMatrix, pos, newpos, action, nextaction,
                rewardMatrix[pos], alpha, gamma)
            # Greedify the policy at the state just updated.
            policy = updatePolicy(policy, stateActionMatrix, pos)
            pos = newpos
    print("Dynamic Programming Utility:")
    print(utilityMatrix)
    print("Sarsa Utility:")
    print(stateActionMatrix)
    print("optimal policy")
    print(optimalPolicy)
    print("learned policy")
    print(policy)