def valueiteratingpolicy(n):
    v = initvalact.initvalact(n)
    value = v[0]
    action = v[1]
    reward = rewardsegregation.rewardsegregation(n, p, p1, encoder, ENClast)
    print reward
    test = ValueIteration.valueiteration(value, reward, action)
    policy = test[1]
    print policy
    raw = 0
    col = 0
    gotopos.gotopos(raw, col, p, p1, n)
    # actions: 0 = up / 1 = down / 2 = left / 3 = right
    global val1
    val1 = pinSetup.valueRead_ON()
    # follow the computed policy until the ON pin signals a stop
    while val1 == 0:
        if action[raw][col] == 0:
            act.playAction(0, raw, col, n, p, p1)
            raw = raw - 1
        elif action[raw][col] == 1:
            act.playAction(1, raw, col, n, p, p1)
            raw = raw + 1
        elif action[raw][col] == 2:
            act.playAction(2, raw, col, n, p, p1)
            col = col - 1
        elif action[raw][col] == 3:
            act.playAction(3, raw, col, n, p, p1)
            col = col + 1
        val1 = pinSetup.valueRead_ON()
    if val1 == 1:
        print "Stop"
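# --- A minimal sketch of a value-iteration routine consistent with how
# ValueIteration.valueiteration(value, reward, action) is called above.
# The deterministic grid transitions, the discount factor (0.9), the
# convergence threshold and the assumption that `value` starts as numeric
# zeros are all illustrative choices, not the project's actual module. ---
def valueiteration(value, reward, action, gama=0.9, theta=1e-3):
    """In-place value iteration over an n x n deterministic grid."""
    n = len(value)
    moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}   # up, down, left, right
    while True:
        delta = 0.0
        for raw in range(n):
            for col in range(n):
                best, bestact = None, None
                for a, (dr, dc) in moves.items():
                    nraw, ncol = raw + dr, col + dc
                    if not (0 <= nraw < n and 0 <= ncol < n):
                        continue                              # skip moves that leave the grid
                    q = reward[raw][col][a] + gama * value[nraw][ncol]
                    if best is None or q > best:
                        best, bestact = q, a
                delta = max(delta, abs(best - value[raw][col]))
                value[raw][col] = best
                action[raw][col] = bestact
        if delta < theta:
            # returns the converged value matrix and the greedy action (policy) matrix
            return value, action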
def rewardsegregation(n, p, p1, encoder, ENClast):
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # visit every cell of the n x n grid and measure the encoder change for each valid action
    for raw in range(0, n):
        for col in range(0, n):
            for action in action_select(raw, col, n):
                gotopos.gotopos(raw, col, p, p1, n)
                time.sleep(0.5)
                ENClast = encoder.getData()
                action_20.playAction(action, raw, col, n, p, p1)
                time.sleep(0.1)
                if action == 0 or action == 1:
                    # for up/down moves the baseline is re-read just before measuring
                    ENClast = encoder.getData()
                ENC = encoder.getData()
                reward[raw][col][action] = ENC - ENClast
    return reward
def rewardsegregation(n, p, p1, encoder, ENClast):
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    for raw in range(0, n):
        for col in range(0, n):
            for action in action_select(raw, col, n):
                gotopos.gotopos(raw, col, p, p1, n)
                time.sleep(0.3)
                ENClast = encoder.getData()
                act.playAction(action, raw, col, n, p, p1)
                time.sleep(t)        # t: settling delay, assumed to be defined at module level
                if action == 0 or action == 1:
                    # for up/down moves the baseline is re-read just before measuring
                    ENClast = encoder.getData()
                ENC = encoder.getData()
                # the direction pin gives the sign of the measured displacement
                direction = pinSetup.valueRead_dir()
                print ((-1) ** direction) * (ENC - ENClast)
                reward[raw][col][action] = ((-1) ** direction) * (ENC - ENClast)
                time.sleep(0.05)
            time.sleep(0.1)
        time.sleep(0.1)
    return reward
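# --- A minimal sketch of the action_select(raw, col, n) helper that both
# reward-segregation variants and the Q-learning loops below rely on.  It
# mirrors the corner/edge handling spelled out in the explicit if/elif chain
# of the first qLearning version; the real module's contents are assumed. ---
def action_select(raw, col, n):
    """Return the list of actions valid at cell (raw, col) of an n x n grid.
    Actions: 0 = up, 1 = down, 2 = left, 3 = right."""
    actions = []
    if raw > 0:
        actions.append(0)        # can move up unless in the top row
    if raw < n - 1:
        actions.append(1)        # can move down unless in the bottom row
    if col > 0:
        actions.append(2)        # can move left unless in the first column
    if col < n - 1:
        actions.append(3)        # can move right unless in the last column
    return actions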
def qLearning(n, p, p1, encoder, ENClast):
    import qinitial
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]
    # a = [[None, None, None], [None, None, None], [None, None, None]]  # initializing action matrix
    # size = size of the Q matrix: [no. of rows, no. of columns, no. of actions possible per state]
    size = np.shape(Q)           # storing the size of the Q matrix
    n = size[0]
    Qlast = generateDummy(Q)     # dummy of the same size as Q, so the while loop is entered
    iteration = 0                # initializing the iteration counter
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    # Loop until the error drops below the threshold; also continue while Q == Qlast,
    # because in the starting phase a zero reward would give error = 0 on the next step
    # and make us fall out of the loop too early.
    while qError(Q, Qlast) > 1.5 or Q == Qlast:
        iteration += 1           # increasing the iteration count
        Qlast = deepcopy(Q)      # copying Q to Qlast
        # select a state randomly, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # temp retrieves the row and column from the state number produced by the random selector:
        # state / no. of columns gives the row number.  For a 3x4 (rows x columns) grid,
        # states 1 to 4 are in row 1 and states 5 to 8 are in row 2;
        # row 1 (states 1 to 4) / 4 (total columns) gives 0 < temp <= 1,
        # row 2 (states 5 to 8) / 4 (total columns) gives 1 < temp <= 2.
        temp = state / (size[1] * 1.0)       # defining a temporary variable
        if (temp).is_integer():
            raw = int(temp) - 1
        else:
            raw = int(temp)
        # temp = state modulo total columns:
        # column 1 (states 1, 5, 9) % 4 (total columns) gives 1 [i.e. column = 1 - 1 = 0],
        # column 2 (states 2, 6, 10) % 4 (total columns) gives 2 [i.e. column = 2 - 1 = 1].
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # go to the randomly selected state
        time.sleep(0.3)
        for i in range(0, 20):
            # action selection according to the selected state
            if raw == 0 and col == 0:
                action = random.choice([1, 3])
            elif raw == 0 and (col == -1 or col == size[1] - 1):
                action = random.choice([1, 2])
            elif raw == 0:
                action = random.choice([1, 2, 3])
            elif raw == size[0] - 1 and col == 0:
                action = random.choice([0, 3])
            elif raw == size[0] - 1 and (col == -1 or col == size[1] - 1):
                action = random.choice([0, 2])
            elif raw == size[0] - 1:
                action = random.choice([0, 2, 3])
            elif col == 0:
                action = random.choice([0, 1, 3])
            elif col == -1 or col == size[1] - 1:
                action = random.choice([0, 1, 2])
            else:
                action = random.randint(0, 3)    # cells where all four actions are possible
            # defining the next state according to the chosen action
            if action == 0:                      # up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1                # row of the next step
                coltemp = col                    # column of the next step
            elif action == 1:                    # down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1
                coltemp = col
            elif action == 2:                    # left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw
                coltemp = col - 1
            else:                                # right movement
                nextstate = Q[raw][col + 1]
                rawtemp = raw
                coltemp = col + 1
            # try executing the Q-iteration formula with no errors
            # _____ADD HERE____
            # ACTION_PERFORMANCE FUNCTION
            # UPDATE_REWARD FUNCTION
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            reward[raw][col][action] = ENC - ENClast
            # update_reward.update_reward(reward, raw, col, action, diff)
            try:
                Q[raw][col][action] = reward[raw][col][action] + gama * (max(nextstate))
            # catch a TypeError (i.e. datatype mismatch) in the equation above
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
    # read the appropriate action back from the calculated values of the Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            a[r][c] = Q[r][c].index(max(Q[r][c]))
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration
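# --- A minimal sketch of the two convergence helpers used above.  qError is
# assumed to be the summed absolute difference between successive Q matrices,
# and generateDummy a same-shaped matrix that differs from Q so the while
# loop is entered on the first check; the real modules may differ. ---
def qError(Q, Qlast):
    """Total absolute change between two Q matrices of identical shape."""
    error = 0.0
    for r in range(len(Q)):
        for c in range(len(Q[r])):
            for act in range(len(Q[r][c])):
                error += abs(Q[r][c][act] - Qlast[r][c][act])
    return error

def generateDummy(Q):
    """Same-shaped copy of Q, offset so that Q != dummy on the first comparison."""
    return [[[q + 1 for q in cell] for cell in row] for row in Q]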
def qLearning(n, p, p1, encoder, ENClast):
    epsilon = 0.7
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    '''Q = [[[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],   # State1, State2, State3
            [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],   # State4, State5, State6
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]   # State7, State8, State9'''
    kMatrix = qinitial.qinitial(n)   # per state-action visit counts, used for the learning-rate decay
    Tr = qinitial.qinitial(n)        # eligibility traces
    a = v[1]
    size = np.shape(Q)               # storing the size of the Q matrix
    n = size[0]
    Qlast = generateDummy(Q)         # dummy of the same size as Q, so the while loop is entered
    iteration = 0                    # initializing the iteration counter
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    state = random.randint(1, size[0] * size[1])
    while qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 100:
        iteration += 1               # increasing the iteration count
        if iteration > 60:
            epsilon = 0.1            # reduce exploration after 60 iterations
        Qlast = deepcopy(Q)          # copying Q to Qlast
        # the starting state is selected randomly only once; afterwards the agent
        # continues from wherever the previous action left it
        # temp retrieves the row and column from the state number produced by the random selector:
        # state / no. of columns gives the row number.  For a 3x4 (rows x columns) grid,
        # states 1 to 4 are in row 1 and states 5 to 8 are in row 2;
        # row 1 (states 1 to 4) / 4 (total columns) gives 0 < temp <= 1,
        # row 2 (states 5 to 8) / 4 (total columns) gives 1 < temp <= 2.
        if iteration == 1:
            temp = state / (size[1] * 1.0)   # defining a temporary variable
            if (temp).is_integer():
                raw = int(temp) - 1
            else:
                raw = int(temp)
            # temp = state modulo total columns:
            # column 1 (states 1, 5, 9) % 4 (total columns) gives 1 [i.e. column = 1 - 1 = 0],
            # column 2 (states 2, 6, 10) % 4 (total columns) gives 2 [i.e. column = 2 - 1 = 1].
            temp = state % size[1]
            col = temp - 1
            if col < 0:
                col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # go to the selected state
        time.sleep(0.3)
        possibleActions = action_select(raw, col, n)
        probablity = epsilon_greedy_policy(Q[raw][col], possibleActions, epsilon)
        actionIndex = np.random.choice(len(probablity), p=probablity)
        action = possibleActions[actionIndex]    # action selection according to the selected state
        '''
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        if i < NumOfSelAct:
            possibleActions = action_select(raw, col, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[raw][col][i])
            action = possibleActions[tempList.index(max(tempList))]
        else:
            possibleActions = action_select(raw, col, n)
            action = random.choice(possibleActions)
        -------------------------------------------------------------
        ***************REPLACED BY EPSILON GREEDY POLICY*************
        -------------------------------------------------------------
        '''
        # defining the next state according to the chosen action
        if action == 0:                      # up movement
            nextstate = Q[raw - 1][col]
            rawtemp = raw - 1                # row of the next step
            coltemp = col                    # column of the next step
        elif action == 1:                    # down movement
            nextstate = Q[raw + 1][col]
            rawtemp = raw + 1
            coltemp = col
        elif action == 2:                    # left movement
            nextstate = Q[raw][col - 1]
            rawtemp = raw
            coltemp = col - 1
        else:                                # right movement
            nextstate = Q[raw][col + 1]
            rawtemp = raw
            coltemp = col + 1
        # try executing the Q-iteration formula with no errors
        # _____ADD HERE____
        # ACTION_PERFORMANCE FUNCTION
        # UPDATE_REWARD FUNCTION
        ENClast = encoder.getData()
        act.playAction(action, raw, col, size[0], p, p1)
        time.sleep(0.1)
        if action == 0 or action == 1:
            ENClast = encoder.getData()
        ENC = encoder.getData()
        diff = ENC - ENClast
        oldreward = reward[raw][col][action]
        # if the new measurement contradicts the stored reward, retry the action once
        if (oldreward != 0 and diff == 0) or (np.sign(oldreward) != np.sign(diff) and oldreward != 0):
            # restriCount[raw][col][action] += 1
            # if restriCount[raw][col][action] < 3:
            print("!! restriction applied !!")
            gotopos.gotopos(raw, col, p, p1, n)
            time.sleep(0.3)
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            diff = ENC - ENClast
        reward[raw][col][action] = diff
        # update_reward.update_reward(reward, raw, col, action, diff)
        kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
        try:
            alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
            eComplement = reward[raw][col][action] + gama * max(nextstate) - Q[raw][col][action]
            e = reward[raw][col][action] + gama * max(nextstate) - max(Q[raw][col])
            # Q(lambda)-style update: decay all traces and spread the error over them
            for r in range(size[0]):
                for c in range(size[1]):
                    for actn in action_select(raw, col, n):
                        Tr[r][c][actn] = gama * lamda * Tr[r][c][actn]
                        Q[r][c][actn] = Q[r][c][actn] + alpha * Tr[r][c][actn] * e
            Q[raw][col][action] = Q[raw][col][action] + alpha * eComplement
            Tr[raw][col][action] += 1
        # catch a TypeError (i.e. datatype mismatch) in the equations above
        except TypeError as e:
            print("TypeError")
            print possibleActions
            print probablity
            print "raw =", raw, "col =", col, "action =", action
        raw = rawtemp
        col = coltemp
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
        print "iteration =", iteration
    # read the appropriate action back from the calculated values of the Q matrix
    print Tr
    print Q
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration
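# --- A minimal sketch of epsilon_greedy_policy as it is used above: it returns
# one probability per entry of possibleActions, spreading epsilon uniformly and
# giving the remaining (1 - epsilon) mass to the greedy action.  The actual
# module implementation is assumed, not reproduced. ---
import numpy as np

def epsilon_greedy_policy(qValues, possibleActions, epsilon):
    """Probability distribution over possibleActions for one state's Q values."""
    probs = np.ones(len(possibleActions)) * epsilon / len(possibleActions)
    greedy = max(range(len(possibleActions)),
                 key=lambda i: qValues[possibleActions[i]])
    probs[greedy] += 1.0 - epsilon   # probabilities sum to 1, as np.random.choice requires
    return probs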
def qLearning(n, p, p1, encoder, ENClast):
    v = initvalact.initvalact(n)
    Q = qinitial.qinitial(n)
    kMatrix = qinitial.qinitial(n)       # per state-action visit counts
    restriCount = qinitial.qinitial(n)
    bita = 0.5                           # learning-rate decay exponent; 1/2 would truncate to 0 under integer division
    # Q = [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
    #      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
    a = v[1]
    # a = [[None, None, None], [None, None, None], [None, None, None]]  # initializing action matrix
    # size = size of the Q matrix: [no. of rows, no. of columns, no. of actions possible per state]
    size = np.shape(Q)                   # storing the size of the Q matrix
    n = size[0]
    Qlast = generateDummy(Q)             # dummy of the same size as Q, so the while loop is entered
    iteration = 0                        # initializing the iteration counter
    reward = generate_rewardmatrix.generate_rewardmatrix(n)
    global val1
    val1 = pinSetup.valueRead_ON()
    # Loop until the error drops below the threshold; also continue while Q == Qlast,
    # because in the starting phase a zero reward would give error = 0 on the next step
    # and make us fall out of the loop too early.  The ON pin (val1) can stop the loop.
    while (qError(Q, Qlast) > 1.5 or Q == Qlast or iteration <= 4 * n) and (val1 == 0):
        iteration += 1                   # increasing the iteration count
        Qlast = deepcopy(Q)              # copying Q to Qlast
        # select a state randomly, depending on the Q size
        state = random.randint(1, size[0] * size[1])
        # derive the row and column from the randomly selected state number
        # (same state / column and state % column scheme as in the earlier versions)
        temp = state / (size[1] * 1.0)   # defining a temporary variable
        if (temp).is_integer():
            raw = int(temp) - 1
        else:
            raw = int(temp)
        temp = state % size[1]
        col = temp - 1
        if col < 0:
            col = size[1] - 1
        gotopos.gotopos(raw, col, p, p1, n)  # go to the randomly selected state
        time.sleep(0.3)
        # NumOfSelAct: number of greedy selections out of the 20 inner steps, growing with the iteration count
        NumOfSelAct = 100 * (1 - 1 / (np.exp(0.05 * (iteration - 2))))
        NumOfSelAct = round(NumOfSelAct * 25 / 100)
        for i in range(0, 20):
            # action selection according to the selected state
            if i < NumOfSelAct:
                possibleActions = action_select(raw, col, n)
                tempList = []
                for j in possibleActions:
                    tempList.append(Q[raw][col][j])
                action = possibleActions[tempList.index(max(tempList))]
                print "for i =", i, "selected action is", action
            else:
                possibleActions = action_select(raw, col, n)
                action = random.choice(possibleActions)
            # defining the next state according to the chosen action
            if action == 0:                      # up movement
                nextstate = Q[raw - 1][col]
                rawtemp = raw - 1                # row of the next step
                coltemp = col                    # column of the next step
            elif action == 1:                    # down movement
                nextstate = Q[raw + 1][col]
                rawtemp = raw + 1
                coltemp = col
            elif action == 2:                    # left movement
                nextstate = Q[raw][col - 1]
                rawtemp = raw
                coltemp = col - 1
            else:                                # right movement
                nextstate = Q[raw][col + 1]
                rawtemp = raw
                coltemp = col + 1
            # try executing the Q-iteration formula with no errors
            # _____ADD HERE____
            # ACTION_PERFORMANCE FUNCTION
            # UPDATE_REWARD FUNCTION
            ENClast = encoder.getData()
            act.playAction(action, raw, col, size[0], p, p1)
            time.sleep(0.1)
            if action == 0 or action == 1:
                ENClast = encoder.getData()
            ENC = encoder.getData()
            # quantize the encoder difference to {-2, -1, 0, 1, 2}
            if ENC - ENClast > 0:
                diff = 1
                if ENC - ENClast > 1:
                    diff = 2
            elif ENC - ENClast < 0:
                diff = -1
                if ENC - ENClast < -1:
                    diff = -2
            else:
                diff = 0
            oldreward = reward[raw][col][action]
            # if the new measurement contradicts the stored reward, retry the action once
            if (oldreward != 0 and diff == 0) or (np.sign(oldreward) != np.sign(diff)):
                # restriCount[raw][col][action] += 1
                # if restriCount[raw][col][action] < 3:
                print("!! restriction applied !!")
                restriCount[raw][col][action] = 0
                gotopos.gotopos(raw, col, p, p1, n)
                time.sleep(0.3)
                ENClast = encoder.getData()
                act.playAction(action, raw, col, size[0], p, p1)
                time.sleep(0.1)
                if action == 0 or action == 1:
                    ENClast = encoder.getData()
                ENC = encoder.getData()
                diff = ENC - ENClast
            # the direction pin gives the sign of the measured displacement
            direction = pinSetup.valueRead_dir()
            reward[raw][col][action] = ((-1) ** direction) * diff
            # update_reward.update_reward(reward, raw, col, action, diff)
            kMatrix[raw][col][action] = kMatrix[raw][col][action] + 1
            try:
                alpha = 1 / ((kMatrix[raw][col][action]) ** bita)
                Q[raw][col][action] = (1 - alpha) * Q[raw][col][action] + \
                    alpha * (reward[raw][col][action] + gama * (max(nextstate)))
            # catch a TypeError (i.e. datatype mismatch) in the equation above
            except TypeError as e:
                print("TypeError")
            raw = rawtemp
            col = coltemp
        print "iteration is", iteration
        print "qerror is", qError(Q, Qlast)
        print "reward is", reward
        val1 = pinSetup.valueRead_ON()
    if val1 == 1:
        # import os
        print "Stop"
        # os.system("shutdown now")
    # read the appropriate action back from the calculated values of the Q matrix
    for r in range(0, size[0]):
        for c in range(0, size[1]):
            possibleActions = action_select(r, c, n)
            tempList = []
            for i in possibleActions:
                tempList.append(Q[r][c][i])
            a[r][c] = possibleActions[tempList.index(max(tempList))]
    print kMatrix
    # print NumOfSelAct
    # the function returns the Q matrix, the action matrix and the number of iterations
    return Q, a, iteration
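# --- A minimal sketch of the matrix initialisers used by every learner above.
# Both are assumed to return an n x n x 4 nested list of zeros (one entry per
# state-action pair), matching the commented-out example Q matrix in the first
# qLearning version; the project's actual modules may differ. ---
def qinitial(n):
    """n x n grid with four zero-initialised Q values per cell."""
    return [[[0.0, 0.0, 0.0, 0.0] for _ in range(n)] for _ in range(n)]

def generate_rewardmatrix(n):
    """n x n grid with four zero-initialised reward entries per cell."""
    return [[[0.0, 0.0, 0.0, 0.0] for _ in range(n)] for _ in range(n)]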
pinVar = pinSetup.pinSetup()
p = pinVar[0]
p1 = pinVar[1]
encoder = pinVar[2]
ENClast = pinVar[3]
i = 1
'''
try:
    while True:
        ENC = encoder.getData()
        print ENC
        time.sleep(0.05)
except KeyboardInterrupt:
    print('interrupted!')
'''
p.start(4.0)
p1.start(6.5)
gotopos.gotopos(2, 2, p, p1)
encoder.setData(0)
time.sleep(2.0)
action_20.playAction(1, 1, 1, 3, p, p1)
time.sleep(0.25)
ENC = encoder.getData()
print ENC
action_20.playAction(2, 2, 1, 3, p, p1)
time.sleep(0.25)
ENC = encoder.getData()
print ENC
time.sleep(2.05)