def __init__(self, action_set, learning_ratio=0.01, gama=0.5, epsilon=0.01,
             load_from_file=False):
    self.action_set = action_set
    self.learning_ratio = learning_ratio
    self.gama = gama
    self.epsilon = epsilon
    self.filename = 'pong_agent_cmac'
    offsets = [8 * 1, 0, 8 * 3, 0, 0]
    dimensions = [
        Dimension(tile_width=8, minn=0, maxx=48),
        Dimension(tile_width=1, minn=0, maxx=2),
        Dimension(tile_width=8, minn=0, maxx=64),
        Dimension(tile_width=1, minn=0, maxx=2),
        Dimension(tile_width=1, minn=0, maxx=3)
    ]
    self.q_func = CMAC(offsets=offsets, dimensions=dimensions, n_tilings=4)
    if load_from_file:
        self.q_func.load_from_file(self.filename)

def testCmacComputation(self):
    key = [0x12, 0x34, 0x56, 0x78, 0x12, 0x34, 0x56, 0x78,
           0x12, 0x34, 0x56, 0x78, 0x12, 0x34, 0x56, 0x78]
    cmac = CMAC(key)
    cmac.update([0xde, 0xad, 0xbe, 0xef])
    result = cmac.final()
    expectedResult = bytearray(
        [0xb5, 0xf3, 0xeb, 0x27, 0x15, 0x45, 0xe5, 0x55])
    self.assertEqual(result, expectedResult)

def aesEncrypt(key, data, mode='CMAC'):
    """AES encryption function

    Args:
        key (bytes): packed 128 bit key
        data (bytes): packed plain text data
        mode (str): optional mode specification (CMAC)

    Returns:
        bytes: packed MIC (CMAC mode) or encrypted data
    """
    if mode == 'CMAC':
        cipher = CMAC()
        # there must be a better way to do this
        key = (int.from_bytes(key[0:4], 'big'),
               int.from_bytes(key[4:8], 'big'),
               int.from_bytes(key[8:12], 'big'),
               int.from_bytes(key[12:16], 'big'))
        if len(data) <= 16:
            length = len(data) * 8
            data = data + bytearray(16 - len(data))  # zero-pad to one block
            data = [(int.from_bytes(data[0:4], 'big'),
                     int.from_bytes(data[4:8], 'big'),
                     int.from_bytes(data[8:12], 'big'),
                     int.from_bytes(data[12:16], 'big'))]
        elif len(data) <= 32:
            length = (len(data) - 16) * 8
            data = data + bytearray(32 - len(data))  # zero-pad to two blocks
            data = [(int.from_bytes(data[0:4], 'big'),
                     int.from_bytes(data[4:8], 'big'),
                     int.from_bytes(data[8:12], 'big'),
                     int.from_bytes(data[12:16], 'big')),
                    (int.from_bytes(data[16:20], 'big'),
                     int.from_bytes(data[20:24], 'big'),
                     int.from_bytes(data[24:28], 'big'),
                     int.from_bytes(data[28:32], 'big'))]
        else:
            # the original fell through here with `length` undefined
            raise ValueError('Data greater than 32 bytes')
        mic = cipher.cmac(key, data, length)
        mic = mic[0].to_bytes(4, 'big')
        return mic
    else:
        try:
            cipher = AES.new(key, AES.MODE_ECB)
        except Exception:  # fall back to a constructor-style AES class
            cipher = AES(key, AES.MODE_ECB)
        return cipher.encrypt(data)

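For sanity-checking the MIC path, here is a minimal cross-check sketch that assumes the PyCryptodome package is installed; its Crypto.Hash.CMAC module is a standard RFC 4493 AES-CMAC. It will only agree with the in-house cipher.cmac if that routine also follows the standard padding and subkey rules, so treat any mismatch as a cue to inspect the zero-padding above.

from Crypto.Cipher import AES
from Crypto.Hash import CMAC as RefCMAC

def reference_mic(key: bytes, data: bytes) -> bytes:
    """Standard AES-CMAC, truncated to a 4-byte MIC like aesEncrypt()."""
    mac = RefCMAC.new(key, ciphermod=AES)
    mac.update(data)
    return mac.digest()[:4]

# Illustrative 16-byte key and short payload (hypothetical test values).
print(reference_mic(bytes(range(16)), b'\xde\xad\xbe\xef').hex())
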
def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9, decayRate=0.9,
             seed=12345, cmac_level=20, cmac_quantization=0.3,
             cmac_beta=0.1, port=12345, serverPath="/home/leno/HFO/bin/"):
    super(SARSA, self).__init__(seed, port, serverPath=serverPath)
    self.name = "SARSA"
    self.qTable = {}
    self.stateActionTrace = {}
    self.epsilon = epsilon
    self.alpha = alpha
    self.gamma = gamma
    self.decayRate = decayRate
    self.cmac = CMAC(cmac_level, cmac_quantization, cmac_beta)

def main():
    print('New agent online!')
    print('..... Initializing learning algorithm: SARSA')
    ACTIONS = [MOVE, SHOOT, PASS_CLOSE, PASS_FAR, DRIBBLE]
    agent = SARSA(ACTIONS)  # renamed so the SARSA class is not shadowed
    print('..... Initializing discretization with CMAC')
    cmac = CMAC(1, 0.5, 0.1)  # renamed so the CMAC class is not shadowed
    print('..... Loading HFO environment')
    hfo = HFOEnvironment()
    print('..... Connecting to HFO server')
    hfo.connectToServer(HIGH_LEVEL_FEATURE_SET,
                        'bin/teams/base/config/formations-dt', 6000,
                        'localhost', 'base_left', False)
    print('..... Start training')
    for episode in itertools.count():
        print('..... Starting episode %d' % episode)
        status = IN_GAME
        step = 0
        while status == IN_GAME:
            step += 1
            old_status = status
            # Get the vector of state features for the current state
            features = hfo.getState()
            state = transformFeatures(features)
            print('State: %s' % str(state))
            action = select_action(state)
            hfo.act(action)
            #print('Action: %s' % str(action))
            # Advance the environment and get the game status
            status = hfo.step()
            #print('Status: %s' % str(status))
            print('.......... Step %d: %s - %s - %s'
                  % (step, str(old_status), str(action), str(status)))
        # Check the outcome of the episode
        print('..... Episode ended with %s' % hfo.statusToString(status))
        # Quit if the server goes down
        if status == SERVER_DOWN:
            hfo.act(QUIT)
            break

def __init__(self, gen_factor, num_weights):
    CMAC.__init__(self, gen_factor, num_weights)

from random import randint, random


class PongAgent:

    def __init__(self, action_set, learning_ratio=0.01, gama=0.5,
                 epsilon=0.01, load_from_file=False):
        self.action_set = action_set
        self.learning_ratio = learning_ratio
        self.gama = gama
        self.epsilon = epsilon
        self.filename = 'pong_agent_cmac'
        offsets = [8 * 1, 0, 8 * 3, 0, 0]
        dimensions = [
            Dimension(tile_width=8, minn=0, maxx=48),
            Dimension(tile_width=1, minn=0, maxx=2),
            Dimension(tile_width=8, minn=0, maxx=64),
            Dimension(tile_width=1, minn=0, maxx=2),
            Dimension(tile_width=1, minn=0, maxx=3)
        ]
        self.q_func = CMAC(offsets=offsets, dimensions=dimensions,
                           n_tilings=4)
        if load_from_file:
            self.q_func.load_from_file(self.filename)

    def __get_action_index(self, action):
        for x in range(0, len(self.action_set)):
            if action == self.action_set[x]:
                return x
        return None

    def __choose_best_action_index(self, state):
        best_state = PongState(state, 0)
        best_reward = self.q_func.get_weight(best_state)
        for a in range(1, len(self.action_set)):
            current_state = PongState(state, a)
            current_reward = self.q_func.get_weight(current_state)
            if current_reward > best_reward:
                best_state = current_state
                best_reward = current_reward
        return best_state.action_index

    def pick_action(self, state):
        best_action_index = self.__choose_best_action_index(state)
        if random() < self.epsilon:
            best_action_index = randint(0, len(self.action_set) - 1)
        return self.action_set[best_action_index]

    # TODO: Remake
    def update_q_function(self, state, action, reward, next_state):
        best_next_action_index = self.__choose_best_action_index(next_state)
        best_action_index = self.__get_action_index(action)
        pong_state = PongState(state, best_action_index)
        pong_next_state = PongState(next_state, best_next_action_index)
        now_q_func = self.q_func.get_weight(pong_state)
        next_q_func = self.q_func.get_weight(pong_next_state)
        expected_reward = reward + self.gama * next_q_func
        new_weight = now_q_func + self.learning_ratio * (expected_reward - now_q_func)
        self.q_func.set_weight(pong_state, new_weight)

    def save_to_file(self):
        self.q_func.save_to_file(self.filename)

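A minimal training-loop sketch for the class above. The environment is a hypothetical stub with a gym-style reset()/step() interface, and the 4-tuple state shape is a guess matched to the first four Dimension entries; only PongAgent's own methods are taken from the snippet.

class StubEnv:
    """Hypothetical stand-in for a Pong environment (not from the snippet)."""
    def reset(self):
        return (24, 1, 32, 1)  # guessed 4-dim state within the Dimension ranges
    def step(self, action):
        return (24, 1, 32, 1), 0.0, True  # next_state, reward, done

env = StubEnv()
agent = PongAgent(action_set=[0, 1, 2])  # e.g. stay / up / down
for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.pick_action(state)          # epsilon-greedy choice
        next_state, reward, done = env.step(action)
        agent.update_q_function(state, action, reward, next_state)
        state = next_state
agent.save_to_file()
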
def transformFeatures(features):
    '''From continuous to discrete using CMAC'''
    # The original referenced the CMAC class directly and an undefined `pts`;
    # quantize via an instance instead (ideally shared, not rebuilt per call).
    cmac = CMAC(1, 0.5, 0.1)
    data = []
    for feature in features:
        quantized_feature = cmac.quantize(feature)
        data.append(quantized_feature)
    return data

import random


class SARSA(Agent):

    def __str__(self):
        """Overwrites the object.__str__ method.

        Returns:
            string (str): Important parameters of the object.
        """
        return "Agent: " + str(self.unum) + ", " + \
               "Type: " + str(self.name) + ", " + \
               "Training steps: " + str(self.training_steps_total) + ", " + \
               "Q-Table size: " + str(len(self.qTable))

    def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9, decayRate=0.9,
                 seed=12345, cmac_level=20, cmac_quantization=0.3,
                 cmac_beta=0.1, port=12345, serverPath="/home/leno/HFO/bin/"):
        super(SARSA, self).__init__(seed, port, serverPath=serverPath)
        self.name = "SARSA"
        self.qTable = {}
        self.stateActionTrace = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.decayRate = decayRate
        self.cmac = CMAC(cmac_level, cmac_quantization, cmac_beta)
        #print('***** %s: Agent uses CMAC(%s,%s,%s)'
        #      % (str(self.unum), str(cmac_level),
        #         str(cmac_quantization), str(cmac_beta)))

    def quantize_features(self, features):
        """CMAC utilities for all agents."""
        quantVar = self.cmac.quantize(features)
        data = []
        # len(quantVar[0]) is the number of variables
        for i in range(0, len(quantVar[0])):
            # Transforms n tuples into a single array
            for var in quantVar:
                # copy each tuple value to the output
                data.append(var[i])
        # returns the output as a tuple
        return tuple(data)

    def advise_action(self, uNum, state):
        """Verifies if the agent can advise a friend, and returns the action
        if possible."""
        return None  # No advising

    def get_Q(self, state, action):
        return self.qTable.get((state, action), 0.0)

    def observe_reward(self, state, action, reward, statePrime):
        """After executing an action, the agent is informed about the
        state-reward-state tuple."""
        pass

    '''
    def observe_reward(self, state, action, reward, statePrime):
        """After executing an action, the agent is informed about the
        state-action-reward-state tuple."""
        if self.exploring:
            # Selects the action for the next state without exploration
            lastState = self.lastState
            self.exploring = False
            nextAction = self.select_action(statePrime)
            # Hereafter the self.lastState refers to statePrime
            # Executes Q-update
            self.learn(lastState, action, reward, self.lastState, nextAction)
            # turns on the exploration again
    '''

    def select_action(self, stateFeatures, state, noAdvice=False):
        """Executes the epsilon-greedy exploration strategy."""
        # stores last CMAC result
        #self.lastState = state
        # select applicable actions
        if stateFeatures[5] == 1:
            # State[5] is 1 when the player can kick the ball
            actions = [self.SHOOT, self.DRIBBLE, self.PASSfar, self.PASSnear]
        else:
            return self.MOVE
        # epsilon-greedy action selection
        if self.exploring and random.random() < self.epsilon and not noAdvice:
            # duplicates weight SHOOT and DRIBBLE more heavily in the draw
            actionsRandom = [self.SHOOT, self.DRIBBLE, self.DRIBBLE,
                             self.SHOOT, self.PASSfar, self.PASSnear]
            return random.choice(actionsRandom)
        else:
            cmacState = self.quantize_features(state)
            qValues = [self.get_Q(cmacState, action) for action in actions]
            maxQ = max(qValues)
            count = qValues.count(maxQ)
            if count > 1:  # and self.exploring:
                best = [i for i in range(len(actions)) if qValues[i] == maxQ]
                if not self.exploring:
                    return actions[best[0]]
                return actions[random.choice(best)]
            else:
                return actions[qValues.index(maxQ)]

    def learn(self, state1, action1, reward, state2, action2):
        qnext = self.get_Q(state2, action2)
        self.learn_Q(state1, action1, reward, reward + self.gamma * qnext)

    def learn_Q(self, state, action, reward, value):
        oldv = self.qTable.get((state, action), None)
        if oldv is None:
            self.qTable[(state, action)] = reward
        else:
            self.qTable[(state, action)] = oldv + self.alpha * (value - oldv)

    def step(self, state, action):
        """Perform a complete training step."""
        # perform action and observe reward & statePrime
        self.execute_action(action)
        status = self.hfo.step()
        stateFeatures = self.hfo.getState()
        statePrime = self.get_transformed_features(stateFeatures)
        stateQuantized = self.quantize_features(state)
        statePrimeQuantized = self.quantize_features(statePrime)
        reward = self.get_reward(status)
        # select actionPrime
        if self.exploring:
            actionPrime = self.select_action(stateFeatures, statePrime, False)
        else:
            actionPrime = self.select_action(stateFeatures, statePrime, True)
        if self.exploring:
            # calculate TDError
            TDError = (reward
                       + self.gamma * self.get_Q(statePrimeQuantized, actionPrime)
                       - self.get_Q(stateQuantized, action))
            # update trace value
            self.stateActionTrace[(stateQuantized, action)] = \
                self.stateActionTrace.get((stateQuantized, action), 0) + 1
            for stateAction in self.stateActionTrace:
                # update ALL Q values and eligibility trace values
                self.qTable[stateAction] = self.qTable.get(stateAction, 0) \
                    + TDError * self.alpha * self.stateActionTrace.get(stateAction, 0)
                # decay the eligibility trace for this state and action
                self.stateActionTrace[stateAction] = \
                    self.gamma * self.decayRate * self.stateActionTrace.get(stateAction, 0)
            #self.learn(stateQuantized, action, reward,
            #           statePrimeQuantized, actionPrime)
        self.training_steps_total += 1
        if status != self.IN_GAME:
            self.stateActionTrace = {}
        return status, statePrime, actionPrime

    def setupAdvising(self, agentIndex, allAgents):
        """This method is called in preparation for advising."""
        pass

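The eligibility-trace logic inside step() is easier to see in isolation. Below is a self-contained sketch of the same SARSA(lambda) update on a plain dictionary; all names are illustrative and use the default hyperparameters from __init__.

alpha, gamma, decay = 0.1, 0.9, 0.9
qTable, traces = {}, {}

def sarsa_lambda_update(s, a, reward, sPrime, aPrime):
    """One SARSA(lambda) step with accumulating eligibility traces."""
    td_error = reward + gamma * qTable.get((sPrime, aPrime), 0.0) \
               - qTable.get((s, a), 0.0)
    traces[(s, a)] = traces.get((s, a), 0.0) + 1.0  # bump the visited pair
    for sa, e in traces.items():
        qTable[sa] = qTable.get(sa, 0.0) + alpha * td_error * e
        traces[sa] = gamma * decay * e  # decay every trace each step

sarsa_lambda_update('s0', 'MOVE', 1.0, 's1', 'SHOOT')
print(qTable)
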
#! /usr/bin/env python
"""
Author: Jeremy M. Stober
Program: TEST_CMAC.PY
Date: Saturday, April 14 2012
Description: Test code for CMAC implementation.
"""

from cmac import CMAC
import numpy as np
import numpy.random as npr
import pdb
import pylab
from utils import dual_scatter, create_cluster_colors_rgb, lvl_scatter

c = CMAC(1, 0.5, 0.1)

# t = [[0.2, 0.2], [.6, .6], [.2, .6], [.6, .2]]
# for i in t:
#     print i, c.quantize(i), c.quantize_alt(i), c.quantize_fast(i)

c = CMAC(3, 0.15, 0.1)
#pdb.set_trace()
#c.quantize_alt([0.05, 0.1])

data = []
for i in range(1000):
    t = npr.rand(2) * 2 - 1
    pts = c.quantize(t)

from cmac import CMAC
import numpy as np
import numpy.random as npr

c = CMAC(5, 0.1, 0.1)
data = [0.5, 0.7, 0.1, 0, 1, 0.543]

#for i in range(1):
#    t = npr.rand(10) * 4 - 2
#    print(type(t))
#    print("t[%d]: %s" % (i, str(t)))
#    pts = c.quantize(t)
#    print("pts[%d]: %s" % (i, str(pts)))
#    #pts = c.quantize_alt(t)
#    #print("pts[%d]: %s" % (i, str(pts)))
#    #pts = c.quantize_fast(t)
#    #print("pts[%d]: %s" % (i, str(pts)))
#    data.append([t, pts])

# Python 3 prints (the original used Python 2 print statements)
print(data)
print(c.quantize(data))

#labels = [d[1][0] for d in data]
#print(len(set(labels)))

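The cmac module imported by these scripts is not shown. As a rough, self-contained illustration of what CMAC-style quantization does, here is a minimal tile-coding sketch; the function name and offset scheme are assumptions, not the module's API.

def tile_code(x, n_tilings=3, tile_width=0.15):
    """Quantize a point with several offset tilings (CMAC's core idea)."""
    coords = []
    for t in range(n_tilings):
        offset = t * tile_width / n_tilings  # evenly staggered tilings
        coords.append(tuple(int((xi + offset) // tile_width) for xi in x))
    return coords

# Nearby points share most tiles, which is what gives CMAC generalization.
print(tile_code([0.05, 0.10]))
print(tile_code([0.07, 0.11]))
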
import random


class SARSA(Agent):

    def __str__(self):
        """Overwrites the object.__str__ method.

        Returns:
            string (str): Important parameters of the object.
        """
        return "Agent: " + str(self.unum) + ", " + \
               "Type: " + str(self.name) + ", " + \
               "Training steps: " + str(self.training_steps_total) + ", " + \
               "Q-Table size: " + str(len(self.qTable))

    def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9, decayRate=0.9,
                 seed=12345, cmac_level=20, cmac_quantization=0.3,
                 cmac_beta=0.1, port=12345, serverPath="/home/leno/HFO/bin/"):
        super(SARSA, self).__init__(seed, port, serverPath=serverPath)
        self.name = "SARSA"
        self.qTable = {}
        self.stateActionTrace = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.decayRate = decayRate
        self.cmac = CMAC(cmac_level, cmac_quantization, cmac_beta)
        #print('***** %s: Agent uses CMAC(%s,%s,%s)'
        #      % (str(self.unum), str(cmac_level),
        #         str(cmac_quantization), str(cmac_beta)))

    def quantize_features(self, features):
        """CMAC utilities for all agents."""
        quantVar = self.cmac.quantize(features)
        # data = []
        data = quantVar
        # len(quantVar[0]) is the number of variables
        # for i in range(0, len(quantVar[0])):
        #     # Transforms n tuples into a single array
        #     for var in quantVar:
        #         # copy each tuple value to the output
        #         data.append(var[i])
        # returns the output as a tuple
        return tuple(data)

    def advise_action(self, uNum, state):
        """Verifies if the agent can advise a friend, and returns the action
        if possible."""
        return None  # No advising

    def get_Q(self, state, action):
        return self.qTable.get((state, action), 0.0)

    def observe_reward(self, state, action, reward, statePrime):
        """After executing an action, the agent is informed about the
        state-reward-state tuple."""
        pass

    '''
    def observe_reward(self, state, action, reward, statePrime):
        """After executing an action, the agent is informed about the
        state-action-reward-state tuple."""
        if self.exploring:
            # Selects the action for the next state without exploration
            lastState = self.lastState
            self.exploring = False
            nextAction = self.select_action(statePrime)
            # Hereafter the self.lastState refers to statePrime
            # Executes Q-update
            self.learn(lastState, action, reward, self.lastState, nextAction)
            # turns on the exploration again
    '''

    def select_action(self, stateFeatures, state, noAdvice=False):
        """Executes the epsilon-greedy exploration strategy."""
        # stores last CMAC result
        #self.lastState = state
        # select applicable actions
        if stateFeatures[5] == 1:
            # State[5] is 1 when the player can kick the ball
            actions = [self.SHOOT, self.DRIBBLE, self.PASSfar, self.PASSnear]
        else:
            return self.MOVE
        # epsilon-greedy action selection
        if self.exploring and random.random() < self.epsilon and not noAdvice:
            # duplicates weight SHOOT and DRIBBLE more heavily in the draw
            actionsRandom = [self.SHOOT, self.DRIBBLE, self.DRIBBLE,
                             self.SHOOT, self.PASSfar, self.PASSnear]
            return random.choice(actionsRandom)
        else:
            cmacState = self.quantize_features(state)
            qValues = [self.get_Q(cmacState, action) for action in actions]
            maxQ = max(qValues)
            count = qValues.count(maxQ)
            if count > 1:  # and self.exploring:
                best = [i for i in range(len(actions)) if qValues[i] == maxQ]
                if not self.exploring:
                    return actions[best[0]]
                return actions[random.choice(best)]
            else:
                return actions[qValues.index(maxQ)]

    def learn(self, state1, action1, reward, state2, action2):
        qnext = self.get_Q(state2, action2)
        self.learn_Q(state1, action1, reward, reward + self.gamma * qnext)

    def learn_Q(self, state, action, reward, value):
        oldv = self.qTable.get((state, action), None)
        if oldv is None:
            self.qTable[(state, action)] = reward
        else:
            self.qTable[(state, action)] = oldv + self.alpha * (value - oldv)

    def step(self, state, action):
        """Perform a complete training step."""
        # perform action and observe reward & statePrime
        self.execute_action(action)
        status = self.hfo.step()
        stateFeatures = self.hfo.getState()
        statePrime = self.get_transformed_features(stateFeatures)
        stateQuantized = self.quantize_features(state)
        statePrimeQuantized = self.quantize_features(statePrime)
        reward = self.get_reward(status)
        # select actionPrime
        if self.exploring:
            actionPrime = self.select_action(stateFeatures, statePrime, False)
        else:
            actionPrime = self.select_action(stateFeatures, statePrime, True)
        if self.exploring:
            # calculate TDError
            TDError = (reward
                       + self.gamma * self.get_Q(statePrimeQuantized, actionPrime)
                       - self.get_Q(stateQuantized, action))
            # update trace value
            self.stateActionTrace[(stateQuantized, action)] = \
                self.stateActionTrace.get((stateQuantized, action), 0) + 1
            for stateAction in self.stateActionTrace:
                # update ALL Q values and eligibility trace values
                self.qTable[stateAction] = self.qTable.get(stateAction, 0) \
                    + TDError * self.alpha * self.stateActionTrace.get(stateAction, 0)
                # decay the eligibility trace for this state and action
                self.stateActionTrace[stateAction] = \
                    self.gamma * self.decayRate * self.stateActionTrace.get(stateAction, 0)
            #self.learn(stateQuantized, action, reward,
            #           statePrimeQuantized, actionPrime)
        self.training_steps_total += 1
        if status != self.IN_GAME:
            self.stateActionTrace = {}
        return status, statePrime, actionPrime

    def setupAdvising(self, agentIndex, allAgents):
        """This method is called in preparation for advising."""
        pass
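
Finally, a hedged sketch of how an episode loop might drive this agent's step() method, assuming the Agent base class wires up the hfo handle, status constants, and feature transform used above; everything outside the class is illustrative, not part of the snippet.

agent = SARSA()                          # defaults from __init__ above
stateFeatures = agent.hfo.getState()     # assumed to exist via the Agent base
state = agent.get_transformed_features(stateFeatures)
action = agent.select_action(stateFeatures, state)
status = agent.IN_GAME
while status == agent.IN_GAME:
    status, state, action = agent.step(state, action)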